diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 484986bde719..b66440c8f9f8 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -1,9612 +1,9626 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K * Copyright 2019 Joyent, Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #endif /* HAVE_IDMAP */ #include "zfs_iter.h" #include "zfs_util.h" #include "zfs_comutil.h" #include "zfs_projectutil.h" libzfs_handle_t *g_zfs; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); static int zfs_do_destroy(int argc, char **argv); static int zfs_do_get(int argc, char **argv); static int zfs_do_inherit(int argc, char **argv); static int zfs_do_list(int argc, char **argv); static int zfs_do_mount(int argc, char **argv); static int zfs_do_rename(int argc, char **argv); static int zfs_do_rollback(int argc, char **argv); static int zfs_do_set(int argc, char **argv); static int zfs_do_upgrade(int argc, char **argv); static int zfs_do_snapshot(int argc, char **argv); static int zfs_do_unmount(int argc, char **argv); static int zfs_do_share(int argc, char **argv); static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); static int zfs_do_allow(int argc, char **argv); static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); static int zfs_do_bookmark(int argc, char **argv); static int zfs_do_channel_program(int argc, char **argv); static int zfs_do_load_key(int argc, char **argv); 
static int zfs_do_unload_key(int argc, char **argv); static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); static int zfs_do_version(int argc, char **argv); static int zfs_do_redact(int argc, char **argv); static int zfs_do_rewrite(int argc, char **argv); static int zfs_do_wait(int argc, char **argv); #ifdef __FreeBSD__ static int zfs_do_jail(int argc, char **argv); static int zfs_do_unjail(int argc, char **argv); #endif #ifdef __linux__ static int zfs_do_zone(int argc, char **argv); static int zfs_do_unzone(int argc, char **argv); #endif static int zfs_do_help(int argc, char **argv); enum zfs_options { ZFS_OPTION_JSON_NUMS_AS_INT = 1024 }; /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_CLONE, HELP_CREATE, HELP_DESTROY, HELP_GET, HELP_INHERIT, HELP_UPGRADE, HELP_LIST, HELP_MOUNT, HELP_PROMOTE, HELP_RECEIVE, HELP_RENAME, HELP_ROLLBACK, HELP_SEND, HELP_SET, HELP_SHARE, HELP_SNAPSHOT, HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, HELP_UNALLOW, HELP_USERSPACE, HELP_GROUPSPACE, HELP_PROJECTSPACE, HELP_PROJECT, HELP_HOLD, HELP_HOLDS, HELP_RELEASE, HELP_DIFF, HELP_BOOKMARK, HELP_CHANNEL_PROGRAM, HELP_LOAD_KEY, HELP_UNLOAD_KEY, HELP_CHANGE_KEY, HELP_VERSION, HELP_REDACT, HELP_REWRITE, HELP_JAIL, HELP_UNJAIL, HELP_WAIT, HELP_ZONE, HELP_UNZONE, } zfs_help_t; typedef struct zfs_command { const char *name; int (*func)(int argc, char **argv); zfs_help_t usage; } zfs_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. 
*/ static zfs_command_t command_table[] = { { "version", zfs_do_version, HELP_VERSION }, { NULL }, { "create", zfs_do_create, HELP_CREATE }, { "destroy", zfs_do_destroy, HELP_DESTROY }, { NULL }, { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, { "rollback", zfs_do_rollback, HELP_ROLLBACK }, { "clone", zfs_do_clone, HELP_CLONE }, { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, { "diff", zfs_do_diff, HELP_DIFF }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, { NULL }, { "userspace", zfs_do_userspace, HELP_USERSPACE }, { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE }, { NULL }, { "project", zfs_do_project, HELP_PROJECT }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, { "share", zfs_do_share, HELP_SHARE }, { "unshare", zfs_do_unshare, HELP_UNSHARE }, { NULL }, { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { "redact", zfs_do_redact, HELP_REDACT }, { NULL }, { "allow", zfs_do_allow, HELP_ALLOW }, { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { NULL }, { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, { NULL }, { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, { "rewrite", zfs_do_rewrite, HELP_REWRITE }, { "wait", zfs_do_wait, HELP_WAIT }, #ifdef __FreeBSD__ { NULL }, { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, #endif #ifdef __linux__ { NULL }, { "zone", zfs_do_zone, HELP_ZONE }, { "unzone", zfs_do_unzone, HELP_UNZONE }, #endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) #define MAX_CMD_LEN 256 zfs_command_t *current_command; static const char * get_usage(zfs_help_t idx) { switch (idx) { case HELP_CLONE: return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: return (gettext("\tcreate [-Pnpuv] [-o property=value] ... " "\n" "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: return (gettext("\tdestroy [-fnpRrv] \n" "\tdestroy [-dnpRrv] " "@[%][,...]\n" "\tdestroy #\n")); case HELP_GET: return (gettext("\tget [-rHp] [-j [--json-int]] [-d max] " "[-o \"all\" | field[,...]]\n" "\t [-t type[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot|bookmark] ...\n")); case HELP_INHERIT: return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: return (gettext("\tlist [-Hp] [-j [--json-int]] [-r|-d max] " "[-o property[,...]] [-s property]...\n\t " "[-S property]... [-t type[,...]] " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount [-j]\n" "\tmount [-flvO] [-o opts] <-a|-R filesystem|" "filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: return (gettext("\treceive [-vMnsFhu] " "[-o =] ... [-x ] ...\n" "\t \n" "\treceive [-vMnsFhu] [-o =] ... " "[-x ] ... 
\n" "\t [-d | -e] \n" "\treceive -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" "\trename -p [-f] \n" "\trename -u [-f] \n" "\trename -r \n")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: return (gettext("\tsend [-DLPbcehnpsVvw] " "[-i|-I snapshot]\n" "\t [-R [-X dataset[,dataset]...]] \n" "\tsend [-DnVvPLecw] [-i snapshot|bookmark] " "\n" "\tsend [-DnPpVvLec] [-i bookmark|snapshot] " "--redact \n" "\tsend [-nVvPe] -t \n" "\tsend [-PnVv] --saved filesystem\n")); case HELP_SET: return (gettext("\tset [-u] ... " " ...\n")); case HELP_SHARE: return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot [-r] [-o property=value] ... " "@ ...\n")); case HELP_UNMOUNT: return (gettext("\tunmount [-fu] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " "<-a [nfs|smb] | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow \n" "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " "\n" "\tallow -c [,...] \n" "\tallow -s @setname [,...] " "\n")); case HELP_UNALLOW: return (gettext("\tunallow [-rldug] " "<\"everyone\"|user|group>[,...]\n" "\t [[,...]] \n" "\tunallow [-rld] -e [[,...]] " "\n" "\tunallow [-r] -c [[,...]] " "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); case HELP_USERSPACE: return (gettext("\tuserspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_GROUPSPACE: return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_PROJECTSPACE: return (gettext("\tprojectspace [-Hp] [-o field[,...]] " "[-s field] ... \n" "\t [-S field] ... \n")); case HELP_PROJECT: return (gettext("\tproject [-d|-r] \n" "\tproject -c [-0] [-d|-r] [-p id] \n" "\tproject -C [-k] [-r] \n" "\tproject [-p id] [-r] [-s] \n")); case HELP_HOLD: return (gettext("\thold [-r] ...\n")); case HELP_HOLDS: return (gettext("\tholds [-rHp] ...\n")); case HELP_RELEASE: return (gettext("\trelease [-r] ...\n")); case HELP_DIFF: return (gettext("\tdiff [-FHth] " "[snapshot|filesystem]\n")); case HELP_BOOKMARK: return (gettext("\tbookmark " "\n")); case HELP_CHANNEL_PROGRAM: return (gettext("\tprogram [-jn] [-t ] " "[-m ]\n" "\t [lua args...]\n")); case HELP_LOAD_KEY: return (gettext("\tload-key [-rn] [-L ] " "<-a | filesystem|volume>\n")); case HELP_UNLOAD_KEY: return (gettext("\tunload-key [-r] " "<-a | filesystem|volume>\n")); case HELP_CHANGE_KEY: return (gettext("\tchange-key [-l] [-o keyformat=]\n" "\t [-o keylocation=] [-o pbkdf2iters=]\n" "\t \n" "\tchange-key -i [-l] \n")); case HELP_VERSION: return (gettext("\tversion [-j]\n")); case HELP_REDACT: return (gettext("\tredact " " ...\n")); case HELP_REWRITE: return (gettext("\trewrite [-Prvx] [-o ] [-l ] " "\n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: return (gettext("\tunjail \n")); case HELP_WAIT: return (gettext("\twait [-t ] \n")); case HELP_ZONE: return (gettext("\tzone \n")); case HELP_UNZONE: return (gettext("\tunzone \n")); default: __builtin_unreachable(); } } void nomem(void) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); exit(1); } /* * Utility function to guarantee malloc() success. 
*/ void * safe_malloc(size_t size) { void *data; if ((data = calloc(1, size)) == NULL) nomem(); return (data); } static void * safe_realloc(void *data, size_t size) { void *newp; if ((newp = realloc(data, size)) == NULL) { free(data); nomem(); } return (newp); } static char * safe_strdup(const char *str) { char *dupstr = strdup(str); if (dupstr == NULL) nomem(); return (dupstr); } /* * Callback routine that will print out information for each of * the properties. */ static int usage_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-22s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, "YES "); if (zfs_prop_inheritable(prop)) (void) fprintf(fp, " YES "); else (void) fprintf(fp, " NO "); (void) fprintf(fp, "%s\n", zfs_prop_values(prop) ?: "-"); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static __attribute__((noreturn)) void usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { (void) fprintf(fp, gettext("usage: zfs command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nEach dataset is of the form: " "pool/[dataset/]*dataset[@name]\n")); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && (strcmp(current_command->name, "set") == 0 || strcmp(current_command->name, "get") == 0 || strcmp(current_command->name, "inherit") == 0 || strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; if (show_properties) { (void) fprintf(fp, "%s", gettext("\nThe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-21s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); (void) fprintf(fp, "\t%-22s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-22s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-22s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-22s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-22s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-22s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-22s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-22s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-22s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-22s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-22s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-22s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-22s ", "written@"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-22s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with 
standard units such as K, M, G, etc.\n")); (void) fprintf(fp, "%s", gettext("\nUser-defined properties " "can be specified by using a name containing a colon " "(:).\n")); (void) fprintf(fp, gettext("\nThe {user|group|project}" "[obj]{used|quota}@ properties must be appended with\n" "a user|group|project specifier of one of these forms:\n" " POSIX name (eg: \"matt\")\n" " POSIX id (eg: \"126829\")\n" " SMB name@domain (eg: \"matt@sun\")\n" " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"), "zfs set|get"); (void) fprintf(fp, gettext("\nFor the delegated permission list, run: %s\n"), "zfs allow|unallow"); (void) fprintf(fp, gettext("\nFor further help on a command or topic, " "run: %s\n"), "zfs help []"); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * Take a property=value argument string and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parseprop(nvlist_t *props, char *propname) { char *propval; if ((propval = strchr(propname, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for property=value argument\n")); return (B_FALSE); } *propval = '\0'; propval++; if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_string(props, propname, propval) != 0) nomem(); return (B_TRUE); } /* * Take a property name argument and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parsepropname(nvlist_t *props, char *propname) { if (strchr(propname, '=') != NULL) { (void) fprintf(stderr, gettext("invalid character " "'=' in property argument\n")); return (B_FALSE); } if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_boolean(props, propname) != 0) nomem(); return (B_TRUE); } static int parse_depth(char *opt, int *flags) { char *tmp; int depth; depth = (int)strtol(opt, &tmp, 0); if (*tmp) { (void) fprintf(stderr, gettext("%s is not an integer\n"), optarg); usage(B_FALSE); } if (depth < 0) { (void) fprintf(stderr, gettext("Depth can not be negative.\n")); usage(B_FALSE); } *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); return (depth); } #define PROGRESS_DELAY 2 /* seconds */ static const char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; static time_t pt_begin; static char *pt_header = NULL; static boolean_t pt_shown; static void start_progress_timer(void) { pt_begin = time(NULL) + PROGRESS_DELAY; pt_shown = B_FALSE; } static void set_progress_header(const char *header) { assert(pt_header == NULL); pt_header = safe_strdup(header); if (pt_shown) { (void) printf("%s: ", header); (void) fflush(stdout); } } static void update_progress(const char *update) { if (!pt_shown && time(NULL) > pt_begin) { int len = strlen(update); (void) printf("%s: %s%*.*s", pt_header, update, len, len, pt_reverse); (void) fflush(stdout); pt_shown = B_TRUE; } else if (pt_shown) { int len = strlen(update); (void) printf("%s%*.*s", update, len, len, pt_reverse); (void) fflush(stdout); } } static void finish_progress(const char *done) { if (pt_shown) { (void) puts(done); (void) fflush(stdout); } free(pt_header); pt_header = NULL; } static int zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) { zfs_handle_t *zhp = 
NULL; int ret = 0; zhp = zfs_open(hdl, dataset, type); if (zhp == NULL) return (1); /* * Volumes may neither be mounted or shared. Potentially in the * future filesystems detected on these volumes could be mounted. */ if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) { zfs_close(zhp); return (0); } /* * Mount and/or share the new filesystem as appropriate. We provide a * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. * * If the user doesn't want the dataset automatically mounted, then * skip the mount/share step */ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { if (zfs_mount_delegation_check()) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but it may only be " "mounted by root\n")); ret = 1; } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); ret = 1; } else if (zfs_share(zhp, NULL) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); ret = 1; } zfs_commit_shares(NULL); } zfs_close(zhp); return (ret); } /* * zfs clone [-p] [-o prop=value] ... * * Given an existing dataset, create a writable copy whose initial contents * are the same as the source. The newly created dataset maintains a * dependency on the original; the original cannot be destroyed so long as * the clone exists. * * The '-p' flag creates all the non-existing ancestors of the target first. */ static int zfs_do_clone(int argc, char **argv) { zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; int ret = 0; int c; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "o:p")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; case 'p': parents = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); goto usage; } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); goto usage; } /* open the source dataset */ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) { nvlist_free(props); return (1); } if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { /* * Now create the ancestors of the target dataset. If the * target already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_close(zhp); nvlist_free(props); return (0); } if (zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); nvlist_free(props); return (1); } } /* pass to libzfs */ ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ if (ret == 0) { if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); } zfs_close(zhp); nvlist_free(props); return (!!ret); usage: ASSERT0P(zhp); nvlist_free(props); usage(B_FALSE); return (-1); } /* - * Return a default volblocksize for the pool which always uses more than - * half of the data sectors. 
This primarily applies to dRAID which always - writes full stripe widths. + * Calculate the minimum allocation size based on the top-level vdevs. */ static uint64_t -default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +calculate_volblocksize(nvlist_t *config) { - uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + uint64_t asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; - nvlist_t *config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (ZVOL_DEFAULT_BLOCKSIZE); } for (int i = 0; i < nvdevs; i++) { nvlist_t *nv = vdevs[i]; uint64_t ashift, ndata, nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0) { /* dRAID minimum allocation width */ asize = MAX(asize, ndata * (1ULL << ashift)); } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { /* raidz minimum allocation width */ if (nparity == 1) asize = MAX(asize, 2 * (1ULL << ashift)); else asize = MAX(asize, 4 * (1ULL << ashift)); } else { /* mirror or (non-redundant) leaf vdev */ asize = MAX(asize, 1ULL << ashift); } } + return (asize); +} + +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0) + asize = calculate_volblocksize(config); + /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. * * n asize blksz used | n asize blksz used * -------------------------+--------------------------------- * 1 4,096 8,192 100% | 9 36,864 32,768 88% * 2 8,192 8,192 100% | 10 40,960 32,768 80% * 3 12,288 8,192 66% | 11 45,056 32,768 72% * 4 16,384 16,384 100% | 12 49,152 32,768 66% * 5 20,480 16,384 80% | 13 53,248 32,768 61% * 6 24,576 16,384 66% | 14 57,344 32,768 57% * 7 28,672 16,384 57% | 15 61,440 32,768 53% * 8 32,768 32,768 100% | 16 65,536 65,536 100% * * This is primarily a concern for dRAID which always allocates * a full stripe width. For dRAID the default stripe width is * n=8 in which case the volblocksize is set to 32k. Ignoring * compression there are no unused sectors. This same reasoning * applies to raidz[2,3] so target 4 sectors to minimize waste. */ uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; while (tgt_volblocksize * 2 <= asize) tgt_volblocksize *= 2; const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { /* Issue a warning when a non-optimal size is requested. */ if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is less than the default " "minimum block size (%llu).\nTo reduce wasted " "space a volblocksize of %llu is recommended.\n"), (u_longlong_t)volblocksize, (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, (u_longlong_t)tgt_volblocksize); } else if (volblocksize < tgt_volblocksize) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is much less than the " "minimum allocation\nunit (%llu), which wastes " "at least %llu%% of space.
To reduce wasted " "space,\nuse a larger volblocksize (%llu is " "recommended), fewer dRAID data disks\n" "per group, or smaller sector size (ashift).\n"), (u_longlong_t)volblocksize, (u_longlong_t)asize, (u_longlong_t)((100 * (asize - volblocksize)) / asize), (u_longlong_t)tgt_volblocksize); } } else { volblocksize = tgt_volblocksize; fnvlist_add_uint64(props, prop, volblocksize); } return (volblocksize); } /* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems * and volumes. Snapshot creation is handled by 'zfs snapshot'. * For volumes, the user must specify a size to be used. * * The '-s' flag applies only to volumes, and indicates that we should not try * to set the reservation for this volume. By default we set a reservation * equal to the size for any volume. For pools with SPA_VERSION >= * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. * * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity * check of arguments and properties, but does not check for permissions, * available space, etc. * * The '-u' flag prevents the newly created file system from being mounted. * * The '-v' flag is for verbose output. * * The '-P' flag is used for parseable output. It implies '-v'. */ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; zpool_handle_t *zpool_handle = NULL; nvlist_t *real_props = NULL; uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t nomount = B_FALSE; boolean_t verbose = B_FALSE; boolean_t parseable = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; const char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":PV:b:nso:puv")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) nomem(); volsize = intval; break; case 'P': verbose = B_TRUE; parseable = B_TRUE; break; case 'p': parents = B_TRUE; break; case 'b': bflag = B_TRUE; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "block size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), intval) != 0) nomem(); break; case 'n': dryrun = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) goto error; break; case 's': noreserve = B_TRUE; break; case 'u': nomount = B_TRUE; break; case 'v': verbose = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { (void) fprintf(stderr, gettext("'-s' and '-b' can only be " "used when creating a volume\n")); goto badusage; } if (nomount && type != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("'-u' can only be " "used when creating a filesystem\n")); goto badusage; } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { 
(void) fprintf(stderr, gettext("missing %s argument\n"), zfs_type_to_name(type)); goto badusage; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); goto badusage; } if (dryrun || type == ZFS_TYPE_VOLUME) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) *p = '/'; if (zpool_handle == NULL) goto error; (void) snprintf(msg, sizeof (msg), dryrun ? gettext("cannot verify '%s'") : gettext("cannot create '%s'"), argv[0]); if (props && (real_props = zfs_valid_proplist(g_zfs, type, props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { zpool_close(zpool_handle); goto error; } } if (type == ZFS_TYPE_VOLUME) { const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); uint64_t volblocksize = default_volblocksize(zpool_handle, real_props); if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && nvlist_lookup_string(props, prop, &strval) != 0) { char *tmp; if (asprintf(&tmp, "%llu", (u_longlong_t)volblocksize) == -1) nomem(); nvlist_add_string(props, prop, tmp); free(tmp); } /* * If volsize is not a multiple of volblocksize, round it * up to the nearest multiple of the volblocksize. */ if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { nvlist_free(props); nomem(); } } } if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); if (spa_version >= SPA_VERSION_REFRESERVATION) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; volsize = zvol_volsize_to_reservation(zpool_handle, volsize, real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { if (nvlist_add_uint64(props, zfs_prop_to_name(resv_prop), volsize) != 0) { nvlist_free(props); nomem(); } } } if (zpool_handle != NULL) { zpool_close(zpool_handle); nvlist_free(real_props); } if (parents && zfs_name_valid(argv[0], type)) { /* * Now create the ancestors of target dataset. If the target * already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[0], type)) { ret = 0; goto error; } if (verbose) { (void) printf(parseable ? "create_ancestors\t%s\n" : dryrun ? "would create ancestors of %s\n" : "create ancestors of %s\n", argv[0]); } if (!dryrun) { if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { goto error; } } } if (verbose) { nvpair_t *nvp = NULL; (void) printf(parseable ? "create\t%s\n" : dryrun ? "would create %s\n" : "create %s\n", argv[0]); while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { uint64_t uval; const char *sval; switch (nvpair_type(nvp)) { case DATA_TYPE_UINT64: VERIFY0(nvpair_value_uint64(nvp, &uval)); (void) printf(parseable ? "property\t%s\t%llu\n" : "\t%s=%llu\n", nvpair_name(nvp), (u_longlong_t)uval); break; case DATA_TYPE_STRING: VERIFY0(nvpair_value_string(nvp, &sval)); (void) printf(parseable ? 
"property\t%s\t%s\n" : "\t%s=%s\n", nvpair_name(nvp), sval); break; default: (void) fprintf(stderr, "property '%s' " "has illegal type %d\n", nvpair_name(nvp), nvpair_type(nvp)); abort(); } } } if (dryrun) { ret = 0; goto error; } /* pass to libzfs */ if (zfs_create(g_zfs, argv[0], type, props) != 0) goto error; if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (nomount) { ret = 0; goto error; } ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); return (ret); badusage: nvlist_free(props); usage(B_FALSE); return (2); } /* * zfs destroy [-rRf] * zfs destroy [-rRd] * * -r Recursively destroy all children * -R Recursively destroy all dependents, including clones * -f Force unmounting of any dependents * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can * either be a child, or a clone of a child. */ typedef struct destroy_cbdata { boolean_t cb_first; boolean_t cb_force; boolean_t cb_recurse; boolean_t cb_error; boolean_t cb_doclones; zfs_handle_t *cb_target; boolean_t cb_defer_destroy; boolean_t cb_verbose; boolean_t cb_parsable; boolean_t cb_dryrun; nvlist_t *cb_nvl; nvlist_t *cb_batchedsnaps; /* first snap in contiguous run */ char *cb_firstsnap; /* previous snap in contiguous run */ char *cb_prevsnap; int64_t cb_snapused; char *cb_snapspec; char *cb_bookmark; uint64_t cb_snap_count; } destroy_cbdata_t; /* * Check for any dependents based on the '-r' or '-R' flags. */ static int destroy_check_dependent(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cbp = data; const char *tname = zfs_get_name(cbp->cb_target); const char *name = zfs_get_name(zhp); if (strncmp(tname, name, strlen(tname)) == 0 && (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { /* * This is a direct descendant, not a clone somewhere else in * the hierarchy. */ if (cbp->cb_recurse) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has children\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } else { /* * This is a clone. We only want to report this if the '-r' * wasn't specified, or the target is a snapshot. 
*/ if (!cbp->cb_recurse && zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has dependent clones\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } out: zfs_close(zhp); return (0); } static int destroy_batched(destroy_cbdata_t *cb) { int error = zfs_destroy_snaps_nvl(g_zfs, cb->cb_batchedsnaps, B_FALSE); fnvlist_free(cb->cb_batchedsnaps); cb->cb_batchedsnaps = fnvlist_alloc(); return (error); } static int destroy_callback(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cb = data; const char *name = zfs_get_name(zhp); int error; if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } /* * Ignore pools (which we've already flagged as an error before getting * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { zfs_close(zhp); return (0); } if (cb->cb_dryrun) { zfs_close(zhp); return (0); } /* * We batch up all contiguous snapshots (even of different * filesystems) and destroy them with one ioctl. We can't * simply do all snap deletions and then all fs deletions, * because we must delete a clone before its origin. */ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { cb->cb_snap_count++; fnvlist_add_boolean(cb->cb_batchedsnaps, name); if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) { error = destroy_batched(cb); if (error != 0) { zfs_close(zhp); return (-1); } } } else { error = destroy_batched(cb); if (error != 0 || zfs_unmount(zhp, NULL, cb->cb_force ? 
MS_FORCE : 0) != 0 || zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { zfs_close(zhp); /* * When performing a recursive destroy we ignore errors * so that the recursive destroy could continue * destroying past problem datasets */ if (cb->cb_recurse) { cb->cb_error = B_TRUE; return (0); } return (-1); } } zfs_close(zhp); return (0); } static int destroy_print_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; const char *name = zfs_get_name(zhp); int err = 0; if (nvlist_exists(cb->cb_nvl, name)) { if (cb->cb_firstsnap == NULL) cb->cb_firstsnap = strdup(name); if (cb->cb_prevsnap != NULL) free(cb->cb_prevsnap); /* this snap continues the current range */ cb->cb_prevsnap = strdup(name); if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) nomem(); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } } else if (cb->cb_firstsnap != NULL) { /* end of this range */ uint64_t used = 0; err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } zfs_close(zhp); return (err); } static int destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) { int err; assert(cb->cb_firstsnap == NULL); assert(cb->cb_prevsnap == NULL); err = zfs_iter_snapshots_sorted_v2(fs_zhp, 0, destroy_print_cb, cb, 0, 0); if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); } cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } return (err); } static int snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; /* Check for clones. */ if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; err = zfs_iter_dependents_v2(zhp, 0, B_TRUE, destroy_check_dependent, cb); } if (err == 0) { if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) nomem(); } zfs_close(zhp); return (err); } static int gather_snapshots(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; err = zfs_iter_snapspec_v2(zhp, 0, cb->cb_snapspec, snapshot_to_nvl_cb, cb); if (err == ENOENT) err = 0; if (err != 0) goto out; if (cb->cb_verbose) { err = destroy_print_snapshots(zhp, cb); if (err != 0) goto out; } if (cb->cb_recurse) err = zfs_iter_filesystems_v2(zhp, 0, gather_snapshots, cb); out: zfs_close(zhp); return (err); } static int destroy_clones(destroy_cbdata_t *cb) { nvpair_t *pair; for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); pair != NULL; pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { boolean_t defer = cb->cb_defer_destroy; int err; /* * We can't defer destroy non-snapshots, so set it to * false while destroying the clones. 
*/ cb->cb_defer_destroy = B_FALSE; err = zfs_iter_dependents_v2(zhp, 0, B_FALSE, destroy_callback, cb); cb->cb_defer_destroy = defer; zfs_close(zhp); if (err != 0) return (err); } } return (0); } static int zfs_do_destroy(int argc, char **argv) { destroy_cbdata_t cb = { 0 }; int rv = 0; int err = 0; int c; zfs_handle_t *zhp = NULL; char *at, *pound; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; break; case 'p': cb.cb_verbose = B_TRUE; cb.cb_parsable = B_TRUE; break; case 'n': cb.cb_dryrun = B_TRUE; break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': cb.cb_force = B_TRUE; break; case 'r': cb.cb_recurse = B_TRUE; break; case 'R': cb.cb_recurse = B_TRUE; cb.cb_doclones = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } at = strchr(argv[0], '@'); pound = strchr(argv[0], '#'); if (at != NULL) { /* Build the list of snaps to destroy in cb_nvl. */ cb.cb_nvl = fnvlist_alloc(); *at = '\0'; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(cb.cb_nvl); return (1); } cb.cb_snapspec = at + 1; if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || cb.cb_error) { rv = 1; goto out; } if (nvlist_empty(cb.cb_nvl)) { (void) fprintf(stderr, gettext("could not find any " "snapshots to destroy; check snapshot names.\n")); rv = 1; goto out; } if (cb.cb_verbose) { char buf[16]; zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf)); if (cb.cb_parsable) { (void) printf("reclaim\t%llu\n", (u_longlong_t)cb.cb_snapused); } else if (cb.cb_dryrun) { (void) printf(gettext("would reclaim %s\n"), buf); } else { (void) printf(gettext("will reclaim %s\n"), buf); } } if (!cb.cb_dryrun) { if (cb.cb_doclones) { cb.cb_batchedsnaps = fnvlist_alloc(); err = destroy_clones(&cb); if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, B_FALSE); } if (err != 0) { rv = 1; goto out; } } if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, cb.cb_defer_destroy); } } if (err != 0) rv = 1; } else if (pound != NULL) { int err; nvlist_t *nvl; if (cb.cb_dryrun) { (void) fprintf(stderr, "dryrun is not supported with bookmark\n"); return (-1); } if (cb.cb_defer_destroy) { (void) fprintf(stderr, "defer destroy is not supported with bookmark\n"); return (-1); } if (cb.cb_recurse) { (void) fprintf(stderr, "recursive is not supported with bookmark\n"); return (-1); } /* * Unfortunately, zfs_bookmark() doesn't honor the * casesensitivity setting. However, we can't simply * remove this check, because lzc_destroy_bookmarks() * ignores non-existent bookmarks, so this is necessary * to get a proper error message. 
*/ if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); return (1); } nvl = fnvlist_alloc(); fnvlist_add_boolean(nvl, argv[0]); err = lzc_destroy_bookmarks(nvl, NULL); if (err != 0) { (void) zfs_standard_error(g_zfs, err, "cannot destroy bookmark"); } nvlist_free(nvl); return (err); } else { /* Open the given dataset */ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; /* * Perform an explicit check for pools before going any further. */ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "operation does not apply to pools\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zfs destroy -r " "%s' to destroy all datasets in the pool\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zpool destroy %s' " "to destroy the pool itself\n"), zfs_get_name(zhp)); rv = 1; goto out; } /* * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; if (!cb.cb_doclones && zfs_iter_dependents_v2(zhp, 0, B_TRUE, destroy_check_dependent, &cb) != 0) { rv = 1; goto out; } if (cb.cb_error) { rv = 1; goto out; } cb.cb_batchedsnaps = fnvlist_alloc(); if (zfs_iter_dependents_v2(zhp, 0, B_FALSE, destroy_callback, &cb) != 0) { rv = 1; goto out; } /* * Do the real thing. The callback will close the * handle regardless of whether it succeeds or not. */ err = destroy_callback(zhp, &cb); zhp = NULL; if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, cb.cb_defer_destroy); } if (err != 0 || cb.cb_error == B_TRUE) rv = 1; } out: fnvlist_free(cb.cb_batchedsnaps); fnvlist_free(cb.cb_nvl); if (zhp != NULL) zfs_close(zhp); return (rv); } static boolean_t is_recvd_column(zprop_get_cbdata_t *cbp) { int i; zfs_get_column_t col; for (i = 0; i < ZFS_GET_NCOLS && (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) if (col == GET_COL_RECVD) return (B_TRUE); return (B_FALSE); } /* * Generates an nvlist with output version for every command based on params. * Purpose of this is to add a version of JSON output, considering the schema * format might be updated for each command in future. 
* * Schema: * * "output_version": { * "command": string, * "vers_major": integer, * "vers_minor": integer, * } */ static nvlist_t * zfs_json_schema(int maj_v, int min_v) { nvlist_t *sch = NULL; nvlist_t *ov = NULL; char cmd[MAX_CMD_LEN]; snprintf(cmd, MAX_CMD_LEN, "zfs %s", current_command->name); sch = fnvlist_alloc(); ov = fnvlist_alloc(); fnvlist_add_string(ov, "command", cmd); fnvlist_add_uint32(ov, "vers_major", maj_v); fnvlist_add_uint32(ov, "vers_minor", min_v); fnvlist_add_nvlist(sch, "output_version", ov); fnvlist_free(ov); return (sch); } static void fill_dataset_info(nvlist_t *list, zfs_handle_t *zhp, boolean_t as_int) { char createtxg[ZFS_MAXPROPLEN]; zfs_type_t type = zfs_get_type(zhp); nvlist_add_string(list, "name", zfs_get_name(zhp)); switch (type) { case ZFS_TYPE_FILESYSTEM: fnvlist_add_string(list, "type", "FILESYSTEM"); break; case ZFS_TYPE_VOLUME: fnvlist_add_string(list, "type", "VOLUME"); break; case ZFS_TYPE_SNAPSHOT: fnvlist_add_string(list, "type", "SNAPSHOT"); break; case ZFS_TYPE_POOL: fnvlist_add_string(list, "type", "POOL"); break; case ZFS_TYPE_BOOKMARK: fnvlist_add_string(list, "type", "BOOKMARK"); break; default: fnvlist_add_string(list, "type", "UNKNOWN"); break; } if (type != ZFS_TYPE_POOL) fnvlist_add_string(list, "pool", zfs_get_pool_name(zhp)); if (as_int) { fnvlist_add_uint64(list, "createtxg", zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG)); } else { if (zfs_prop_get(zhp, ZFS_PROP_CREATETXG, createtxg, sizeof (createtxg), NULL, NULL, 0, B_TRUE) == 0) fnvlist_add_string(list, "createtxg", createtxg); } if (type == ZFS_TYPE_SNAPSHOT) { char *snap = strdup(zfs_get_name(zhp)); char *ds = strsep(&snap, "@"); fnvlist_add_string(list, "dataset", ds); fnvlist_add_string(list, "snapshot_name", snap); free(ds); } } /* * zfs get [-rHp] [-j [--json-int]] [-o all | field[,field]...] * [-s source[,source]...] * < all | property[,property]... > < fs | snap | vol > ... * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. * -o Set of fields to display. One of "name,property,value, * received,source". Default is "name,property,value,source". * "all" is an alias for all five. * -s Set of sources to allow. One of * "local,default,inherited,received,temporary,none". Default is * all six. * -p Display values in parsable (literal) format. * -j Display output in JSON format. * --json-int Display numbers as integers instead of strings. * * Prints properties for the given datasets. The user can control which * columns to display as well as which property types to allow. */ /* * Invoked to display the properties for a single dataset. */ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAX_DATASET_NAME_LEN]; zprop_get_cbdata_t *cbp = data; nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; nvlist_t *item, *d = NULL, *props = NULL; const char *strval; const char *sourceval; boolean_t received = is_recvd_column(cbp); int err = 0; if (cbp->cb_json) { d = fnvlist_lookup_nvlist(cbp->cb_jsobj, "datasets"); if (d == NULL) { fprintf(stderr, "datasets obj not found.\n"); exit(1); } props = fnvlist_alloc(); } for (; pl != NULL; pl = pl->pl_next) { char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. 
*/ if (pl->pl_prop == ZFS_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, pl->pl_prop, buf, sizeof (buf), &sourcetype, source, sizeof (source), cbp->cb_literal) != 0) { if (pl->pl_all) continue; if (!zfs_prop_valid_for_type(pl->pl_prop, ZFS_TYPE_DATASET, B_FALSE)) { (void) fprintf(stderr, gettext("No such property '%s'\n"), zfs_prop_to_name(pl->pl_prop)); continue; } sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } if (received && (zfs_prop_get_recvd(zhp, zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; err = zprop_collect_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), buf, sourcetype, source, recvdval, props); } else if (zfs_prop_userquota(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } err = zprop_collect_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL, props); } else if (zfs_prop_written(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_written(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } err = zprop_collect_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL, props); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; sourcetype = ZPROP_SRC_NONE; strval = "-"; } else { strval = fnvlist_lookup_string(propval, ZPROP_VALUE); sourceval = fnvlist_lookup_string(propval, ZPROP_SOURCE); if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; } else if (strcmp(sourceval, ZPROP_SOURCE_VAL_RECVD) == 0) { sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, sourceval, sizeof (source)); } } if (received && (zfs_prop_get_recvd(zhp, pl->pl_user_prop, rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; err = zprop_collect_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, source, recvdval, props); } if (err != 0) return (err); } if (cbp->cb_json) { if (!nvlist_empty(props)) { item = fnvlist_alloc(); fill_dataset_info(item, zhp, cbp->cb_json_as_int); fnvlist_add_nvlist(item, "properties", props); fnvlist_add_nvlist(d, zfs_get_name(zhp), item); fnvlist_free(props); fnvlist_free(item); } else { fnvlist_free(props); } } return (0); } static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; char *fields; int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; nvlist_t *data; /* * Set up default columns and sources. 
*/ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_DATASET; struct option long_options[] = { {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":d:o:s:jrt:Hp", long_options, NULL)) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'j': cb.cb_json = B_TRUE; cb.cb_jsobj = zfs_json_schema(0, 1); data = fnvlist_alloc(); fnvlist_add_nvlist(cb.cb_jsobj, "datasets", data); fnvlist_free(data); break; case ZFS_OPTION_JSON_NUMS_AS_INT: cb.cb_json_as_int = B_TRUE; cb.cb_literal = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'o': /* * Process the set of columns to display. We zero out * the structure to give us a blank slate. */ memset(&cb.cb_columns, 0, sizeof (cb.cb_columns)); i = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_subopts[] = { "name", "property", "value", "received", "source", "all" }; static const zfs_get_column_t col_subopt_col[] = { GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE }; static const int col_subopt_flags[] = { 0, 0, 0, ZFS_ITER_RECVD_PROPS, 0 }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } for (c = 0; c < ARRAY_SIZE(col_subopts); ++c) if (strcmp(tok, col_subopts[c]) == 0) goto found; (void) fprintf(stderr, gettext("invalid column name '%s'\n"), tok); usage(B_FALSE); found: if (c >= 5) { if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } memcpy(cb.cb_columns, col_subopt_col, sizeof (col_subopt_col)); flags |= ZFS_ITER_RECVD_PROPS; i = ZFS_GET_NCOLS; } else { cb.cb_columns[i++] = col_subopt_col[c]; flags |= col_subopt_flags[c]; } } break; case 's': cb.cb_sources = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const source_opt[] = { "local", "default", "inherited", "received", "temporary", "none" }; static const int source_flg[] = { ZPROP_SRC_LOCAL, ZPROP_SRC_DEFAULT, ZPROP_SRC_INHERITED, ZPROP_SRC_RECEIVED, ZPROP_SRC_TEMPORARY, ZPROP_SRC_NONE }; for (i = 0; i < ARRAY_SIZE(source_opt); ++i) if (strcmp(tok, source_opt[i]) == 0) { cb.cb_sources |= source_flg[i]; goto found2; } (void) fprintf(stderr, gettext("invalid source '%s'\n"), tok); usage(B_FALSE); found2:; } break; case 't': types = 0; flags &= ~ZFS_ITER_PROP_LISTSNAPS; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_opts[] = { "filesystem", "fs", "volume", "vol", "snapshot", "snap", "bookmark", "all" }; static const int type_types[] = { ZFS_TYPE_FILESYSTEM, ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, ZFS_TYPE_VOLUME, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; for (i = 0; i < ARRAY_SIZE(type_opts); ++i) if (strcmp(tok, type_opts[i]) == 0) { types |= type_types[i]; goto found3; } (void) fprintf(stderr, gettext("invalid type '%s'\n"), tok); usage(B_FALSE); found3:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += 
optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (!cb.cb_json && cb.cb_json_as_int) { (void) fprintf(stderr, gettext("'--json-int' only works with" " '-j' option\n")); usage(B_FALSE); } fields = argv[0]; /* * Handle users who want to get all snapshots or bookmarks * of a dataset (ex. 'zfs get -t snapshot refer '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); argc--; argv++; /* * As part of zfs_expand_proplist(), we keep track of the maximum column * width for each property. For the 'NAME' (and 'SOURCE') columns, we * need to know the maximum name length. However, the user likely did * not specify 'name' as one of the properties to fetch, so we need to * make sure we always include at least this property for * print_get_headers() to work properly. */ if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZFS_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } cb.cb_first = B_TRUE; /* run for each object */ ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (ret == 0 && cb.cb_json) zcmd_print_json(cb.cb_jsobj); else if (ret != 0 && cb.cb_json) nvlist_free(cb.cb_jsobj); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } /* * inherit [-rS] ... * * -r Recurse over all children * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to * use the default value. The '-r' flag will recurse over all children, and is * useful for setting a property on a hierarchy-wide basis, regardless of any * local modifications for each dataset. */ typedef struct inherit_cbdata { const char *cb_propname; boolean_t cb_received; } inherit_cbdata_t; static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that * are not valid for this type of dataset. 
*/ if (prop != ZPROP_INVAL && !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE)) return (0); return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; int ret = 0; int flags = 0; boolean_t received = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'S': received = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing property argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } propname = argv[0]; argc--; argv++; if ((prop = zfs_name_to_prop(propname)) != ZPROP_USERPROP) { if (zfs_prop_readonly(prop)) { (void) fprintf(stderr, gettext( "%s property is read-only\n"), propname); return (1); } if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); (void) fprintf(stderr, gettext("use 'zfs " "inherit -S %s' to revert to received " "value\n"), propname); } return (1); } if (received && (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION)) { (void) fprintf(stderr, gettext("'%s' property cannot " "be reverted to a received value\n"), propname); return (1); } } else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } cb.cb_propname = propname; cb.cb_received = received; if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_cb, &cb); } return (ret); } typedef struct upgrade_cbdata { uint64_t cb_numupgraded; uint64_t cb_numsamegraded; uint64_t cb_numfailed; uint64_t cb_version; boolean_t cb_newer; boolean_t cb_foundone; char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; } upgrade_cbdata_t; static int same_pool(zfs_handle_t *zhp, const char *name) { int len1 = strcspn(name, "/@"); const char *zhname = zfs_get_name(zhp); int len2 = strcspn(zhname, "/@"); if (len1 != len2) return (B_FALSE); return (strncmp(name, zhname, len1) == 0); } static int upgrade_list_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); /* list if it's old/new */ if ((!cb->cb_newer && version < ZPL_VERSION) || (cb->cb_newer && version > ZPL_VERSION)) { char *str; if (cb->cb_newer) { str = gettext("The following filesystems are " "formatted using a newer software version and\n" "cannot be accessed on the current system.\n\n"); } else { str = gettext("The following filesystems are " "out of date, and can be upgraded. 
After being\n" "upgraded, these filesystems (and any 'zfs send' " "streams generated from\n" "subsequent snapshots) will no longer be " "accessible by older software versions.\n\n"); } if (!cb->cb_foundone) { (void) puts(str); (void) printf(gettext("VER FILESYSTEM\n")); (void) printf(gettext("--- ------------\n")); cb->cb_foundone = B_TRUE; } (void) printf("%2u %s\n", version, zfs_get_name(zhp)); } return (0); } static int upgrade_set_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int needed_spa_version; int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); needed_spa_version = zfs_spa_version_map(cb->cb_version); if (needed_spa_version < 0) return (-1); if (spa_version < needed_spa_version) { /* can't upgrade */ (void) printf(gettext("%s: can not be " "upgraded; the pool version needs to first " "be upgraded\nto version %d\n\n"), zfs_get_name(zhp), needed_spa_version); cb->cb_numfailed++; return (0); } /* upgrade */ if (version < cb->cb_version) { char verstr[24]; (void) snprintf(verstr, sizeof (verstr), "%llu", (u_longlong_t)cb->cb_version); if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; else cb->cb_numfailed++; (void) strlcpy(cb->cb_lastfs, zfs_get_name(zhp), sizeof (cb->cb_lastfs)); } else if (version > cb->cb_version) { /* can't downgrade */ (void) printf(gettext("%s: can not be downgraded; " "it is already at version %u\n"), zfs_get_name(zhp), version); cb->cb_numfailed++; } else { cb->cb_numsamegraded++; } return (0); } /* * zfs upgrade * zfs upgrade -v * zfs upgrade [-r] [-V ] <-a | filesystem> */ static int zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; int ret = 0; upgrade_cbdata_t cb = { 0 }; int c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "rvV:a")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'v': showversions = B_TRUE; break; case 'V': if (zfs_prop_string_to_index(ZFS_PROP_VERSION, optarg, &cb.cb_version) != 0) { (void) fprintf(stderr, gettext("invalid version %s\n"), optarg); usage(B_FALSE); } break; case 'a': all = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) usage(B_FALSE); if (showversions && (flags & ZFS_ITER_RECURSE || all || cb.cb_version || argc)) usage(B_FALSE); if ((all || argc) && (showversions)) usage(B_FALSE); if (all && argc) usage(B_FALSE); if (showversions) { /* Show info on available versions. 
*/ (void) printf(gettext("The following filesystem versions are " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and filesystem " "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf("see the ZFS Administration Guide.\n\n"); ret = 0; } else if (argc || all) { /* Upgrade filesystems */ if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), (u_longlong_t)cb.cb_numupgraded); if (cb.cb_numsamegraded) { (void) printf(gettext("%llu filesystems already at " "this version\n"), (u_longlong_t)cb.cb_numsamegraded); } if (cb.cb_numfailed != 0) ret = 1; } else { /* List old-version filesystems */ boolean_t found; (void) printf(gettext("This system is currently running " "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret |= zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " "formatted with the current version.\n")); } } return (ret); } /* * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] * filesystem | snapshot | path * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] * filesystem | snapshot | path * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] filesystem | snapshot | path * * -H Scripted mode; elide headers and separate columns by tabs. * -i Translate SID to POSIX ID. * -n Print numeric ID instead of user/group name. * -o Control which fields to display. * -p Use exact (parsable) numeric output. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * Displays space consumed by, and quotas on, each user in the specified * filesystem or snapshot. 
*/ /* us_field_types, us_field_hdr and us_field_names should be kept in sync */ enum us_field_types { USFIELD_TYPE, USFIELD_NAME, USFIELD_USED, USFIELD_QUOTA, USFIELD_OBJUSED, USFIELD_OBJQUOTA }; static const char *const us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA", "OBJUSED", "OBJQUOTA" }; static const char *const us_field_names[] = { "type", "name", "used", "quota", "objused", "objquota" }; #define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) #define USTYPE_PSX_GRP (1 << 0) #define USTYPE_PSX_USR (1 << 1) #define USTYPE_SMB_GRP (1 << 2) #define USTYPE_SMB_USR (1 << 3) #define USTYPE_PROJ (1 << 4) #define USTYPE_ALL \ (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \ USTYPE_PROJ) static int us_type_bits[] = { USTYPE_PSX_GRP, USTYPE_PSX_USR, USTYPE_SMB_GRP, USTYPE_SMB_USR, USTYPE_ALL }; static const char *const us_type_names[] = { "posixgroup", "posixuser", "smbgroup", "smbuser", "all" }; typedef struct us_node { nvlist_t *usn_nvl; uu_avl_node_t usn_avlnode; uu_list_node_t usn_listnode; } us_node_t; typedef struct us_cbdata { nvlist_t **cb_nvlp; uu_avl_pool_t *cb_avl_pool; uu_avl_t *cb_avl; boolean_t cb_numname; boolean_t cb_nicenum; boolean_t cb_sid2posix; zfs_userquota_prop_t cb_prop; zfs_sort_column_t *cb_sortcol; size_t cb_width[USFIELD_LAST]; } us_cbdata_t; static boolean_t us_populated = B_FALSE; typedef struct { zfs_sort_column_t *si_sortcol; boolean_t si_numname; } us_sort_info_t; static int us_field_index(const char *field) { for (int i = 0; i < USFIELD_LAST; i++) { if (strcmp(field, us_field_names[i]) == 0) return (i); } return (-1); } static int us_compare(const void *larg, const void *rarg, void *unused) { const us_node_t *l = larg; const us_node_t *r = rarg; us_sort_info_t *si = (us_sort_info_t *)unused; zfs_sort_column_t *sortcol = si->si_sortcol; boolean_t numname = si->si_numname; nvlist_t *lnvl = l->usn_nvl; nvlist_t *rnvl = r->usn_nvl; int rc = 0; boolean_t lvb, rvb; for (; sortcol != NULL; sortcol = sortcol->sc_next) { const char *lvstr = ""; const char *rvstr = ""; uint32_t lv32 = 0; uint32_t rv32 = 0; uint64_t lv64 = 0; uint64_t rv64 = 0; zfs_prop_t prop = sortcol->sc_prop; const char *propname = NULL; boolean_t reverse = sortcol->sc_reverse; switch (prop) { case ZFS_PROP_TYPE: propname = "type"; (void) nvlist_lookup_uint32(lnvl, propname, &lv32); (void) nvlist_lookup_uint32(rnvl, propname, &rv32); if (rv32 != lv32) rc = (rv32 < lv32) ? 1 : -1; break; case ZFS_PROP_NAME: propname = "name"; if (numname) { compare_nums: (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; } else { if ((nvlist_lookup_string(lnvl, propname, &lvstr) == ENOENT) || (nvlist_lookup_string(rnvl, propname, &rvstr) == ENOENT)) { goto compare_nums; } rc = strcmp(lvstr, rvstr); } break; case ZFS_PROP_USED: case ZFS_PROP_QUOTA: if (!us_populated) break; if (prop == ZFS_PROP_USED) propname = "used"; else propname = "quota"; (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; break; default: break; } if (rc != 0) { if (rc < 0) return (reverse ? 1 : -1); else return (reverse ? -1 : 1); } } /* * If entries still seem to be the same, check if they are of the same * type (smbentity is added only if we are doing SID to POSIX ID * translation where we can have duplicate type/name combinations). 
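 * For example (illustrative), under 'zfs userspace -i' an SMB user and
 * a POSIX user can both resolve to uid 1001; the flag keeps those two
 * AVL nodes distinct instead of silently merging them.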
*/ if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && lvb != rvb) return (lvb < rvb ? -1 : 1); return (0); } static boolean_t zfs_prop_is_user(unsigned p) { return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA || p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA); } static boolean_t zfs_prop_is_group(unsigned p) { return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA || p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA); } static boolean_t zfs_prop_is_project(unsigned p) { return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA || p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA); } static inline const char * us_type2str(unsigned field_type) { switch (field_type) { case USTYPE_PSX_USR: return ("POSIX User"); case USTYPE_PSX_GRP: return ("POSIX Group"); case USTYPE_SMB_USR: return ("SMB User"); case USTYPE_SMB_GRP: return ("SMB Group"); case USTYPE_PROJ: return ("Project"); default: return ("Undefined"); } } static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space, uint64_t default_quota) { us_cbdata_t *cb = (us_cbdata_t *)arg; zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; const char *propname; char sizebuf[32]; us_node_t *node; uu_avl_pool_t *avl_pool = cb->cb_avl_pool; uu_avl_t *avl = cb->cb_avl; uu_avl_index_t idx; nvlist_t *props; us_node_t *n; zfs_sort_column_t *sortcol = cb->cb_sortcol; unsigned type = 0; const char *typestr; size_t namelen; size_t typelen; size_t sizelen; int typeidx, nameidx, sizeidx; us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; boolean_t smbentity = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); node = safe_malloc(sizeof (us_node_t)); uu_avl_node_init(node, &node->usn_avlnode, avl_pool); node->usn_nvl = props; if (domain != NULL && domain[0] != '\0') { #ifdef HAVE_IDMAP /* SMB */ char sid[MAXNAMELEN + 32]; uid_t id; uint64_t classes; int err; directory_error_t e; smbentity = B_TRUE; (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { type = USTYPE_SMB_GRP; err = sid_to_id(sid, B_FALSE, &id); } else { type = USTYPE_SMB_USR; err = sid_to_id(sid, B_TRUE, &id); } if (err == 0) { rid = id; if (!cb->cb_sid2posix) { e = directory_name_from_sid(NULL, sid, &name, &classes); if (e != NULL) directory_error_free(e); if (name == NULL) name = sid; } } #else nvlist_free(props); free(node); return (-1); #endif /* HAVE_IDMAP */ } if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { /* POSIX or -i */ if (zfs_prop_is_group(prop)) { type = USTYPE_PSX_GRP; if (!cb->cb_numname) { struct group *g; if ((g = getgrgid(rid)) != NULL) name = g->gr_name; } } else if (zfs_prop_is_user(prop)) { type = USTYPE_PSX_USR; if (!cb->cb_numname) { struct passwd *p; if ((p = getpwuid(rid)) != NULL) name = p->pw_name; } } else { type = USTYPE_PROJ; } } /* * Make sure that the type/name combination is unique when doing * SID to POSIX ID translation (hence changing the type from SMB to * POSIX). 
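 * The flag is recorded in the node's nvlist below precisely so that
 * us_compare() can use it as a tie-breaker.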
*/ if (cb->cb_sid2posix && nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) nomem(); /* Calculate/update width of TYPE field */ typestr = us_type2str(type); typelen = strlen(gettext(typestr)); typeidx = us_field_index("type"); if (typelen > cb->cb_width[typeidx]) cb->cb_width[typeidx] = typelen; if (nvlist_add_uint32(props, "type", type) != 0) nomem(); /* Calculate/update width of NAME field */ if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { if (nvlist_add_uint64(props, "name", rid) != 0) nomem(); namelen = snprintf(NULL, 0, "%u", rid); } else { if (nvlist_add_string(props, "name", name) != 0) nomem(); namelen = strlen(name); } nameidx = us_field_index("name"); if (nameidx >= 0 && namelen > cb->cb_width[nameidx]) cb->cb_width[nameidx] = namelen; /* * Check if this type/name combination is in the list and update it; * otherwise add new node to the list. */ if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { uu_avl_insert(avl, node, idx); } else { nvlist_free(props); free(node); node = n; props = node->usn_nvl; } /* Calculate/update width of USED/QUOTA fields */ if (cb->cb_nicenum) { if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTUSED || prop == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(space, sizebuf, sizeof (sizebuf)); } else { zfs_nicenum(space, sizebuf, sizeof (sizebuf)); } } else { (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", (u_longlong_t)space); } sizelen = strlen(sizebuf); if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_PROJECTUSED) { propname = "used"; if (!nvlist_exists(props, "quota")) (void) nvlist_add_uint64(props, "quota", default_quota); } else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTQUOTA) { propname = "quota"; if (!nvlist_exists(props, "used")) (void) nvlist_add_uint64(props, "used", 0); } else if (prop == ZFS_PROP_USEROBJUSED || prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) { propname = "objused"; if (!nvlist_exists(props, "objquota")) { (void) nvlist_add_uint64(props, "objquota", default_quota); } } else if (prop == ZFS_PROP_USEROBJQUOTA || prop == ZFS_PROP_GROUPOBJQUOTA || prop == ZFS_PROP_PROJECTOBJQUOTA) { propname = "objquota"; if (!nvlist_exists(props, "objused")) (void) nvlist_add_uint64(props, "objused", 0); } else { return (-1); } sizeidx = us_field_index(propname); if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx]) cb->cb_width[sizeidx] = sizelen; if (nvlist_add_uint64(props, propname, space) != 0) nomem(); return (0); } static void print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, us_node_t *node) { nvlist_t *nvl = node->usn_nvl; char valstr[MAXNAMELEN]; boolean_t first = B_TRUE; int cfield = 0; int field; uint32_t ustype; /* Check type */ (void) nvlist_lookup_uint32(nvl, "type", &ustype); if (!(ustype & types)) return; while ((field = fields[cfield]) != USFIELD_LAST) { nvpair_t *nvp = NULL; data_type_t type; uint32_t val32 = -1; uint64_t val64 = -1; const char *strval = "-"; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) if (strcmp(nvpair_name(nvp), us_field_names[field]) == 0) break; type = nvp == NULL ? 
DATA_TYPE_UNKNOWN : nvpair_type(nvp); switch (type) { case DATA_TYPE_UINT32: val32 = fnvpair_value_uint32(nvp); break; case DATA_TYPE_UINT64: val64 = fnvpair_value_uint64(nvp); break; case DATA_TYPE_STRING: strval = fnvpair_value_string(nvp); break; case DATA_TYPE_UNKNOWN: break; default: (void) fprintf(stderr, "invalid data type\n"); } switch (field) { case USFIELD_TYPE: if (type == DATA_TYPE_UINT32) strval = us_type2str(val32); break; case USFIELD_NAME: if (type == DATA_TYPE_UINT64) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } break; case USFIELD_USED: case USFIELD_QUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_QUOTA && val64 == 0) { strval = "none"; } else { zfs_nicebytes(val64, valstr, sizeof (valstr)); strval = valstr; } } break; case USFIELD_OBJUSED: case USFIELD_OBJQUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_OBJQUOTA && val64 == 0) { strval = "none"; } else { zfs_nicenum(val64, valstr, sizeof (valstr)); strval = valstr; } } break; } if (!first) { if (scripted) (void) putchar('\t'); else (void) fputs(" ", stdout); } if (scripted) (void) fputs(strval, stdout); else if (field == USFIELD_TYPE || field == USFIELD_NAME) (void) printf("%-*s", (int)width[field], strval); else (void) printf("%*s", (int)width[field], strval); first = B_FALSE; cfield++; } (void) putchar('\n'); } static void print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, boolean_t rmnode, uu_avl_t *avl) { us_node_t *node; const char *col; int cfield = 0; int field; if (!scripted) { boolean_t first = B_TRUE; while ((field = fields[cfield]) != USFIELD_LAST) { col = gettext(us_field_hdr[field]); if (field == USFIELD_TYPE || field == USFIELD_NAME) { (void) printf(first ? "%-*s" : " %-*s", (int)width[field], col); } else { (void) printf(first ? 
"%*s" : " %*s", (int)width[field], col); } first = B_FALSE; cfield++; } (void) printf("\n"); } for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { print_us_node(scripted, parsable, fields, types, width, node); if (rmnode) nvlist_free(node->usn_nvl); } } static int zfs_do_userspace(int argc, char **argv) { zfs_handle_t *zhp; zfs_userquota_prop_t p; uu_avl_pool_t *avl_pool; uu_avl_t *avl_tree; uu_avl_walk_t *walk; char *delim; char deffields[] = "type,name,used,quota,objused,objquota"; char *ofield = NULL; char *tfield = NULL; int cfield = 0; int fields[256]; int i; boolean_t scripted = B_FALSE; boolean_t prtnum = B_FALSE; boolean_t parsable = B_FALSE; boolean_t sid2posix = B_FALSE; int ret = 0; int c; zfs_sort_column_t *sortcol = NULL; int types = USTYPE_PSX_USR | USTYPE_SMB_USR; us_cbdata_t cb; us_node_t *node; us_node_t *rmnode; uu_list_pool_t *listpool; uu_list_t *list; uu_avl_index_t idx = 0; uu_list_index_t idx2 = 0; if (argc < 2) usage(B_FALSE); if (strcmp(argv[0], "groupspace") == 0) { /* Toggle default group types */ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; } else if (strcmp(argv[0], "projectspace") == 0) { types = USTYPE_PROJ; prtnum = B_TRUE; } while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { switch (c) { case 'n': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'n'\n")); usage(B_FALSE); } prtnum = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'o': ofield = optarg; break; case 's': case 'S': if (zfs_add_sort_column(&sortcol, optarg, c == 's' ? B_FALSE : B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid field '%s'\n"), optarg); usage(B_FALSE); } break; case 't': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 't'\n")); usage(B_FALSE); } tfield = optarg; break; case 'i': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'i'\n")); usage(B_FALSE); } sid2posix = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* Use default output fields if not specified using -o */ if (ofield == NULL) ofield = deffields; do { if ((delim = strchr(ofield, ',')) != NULL) *delim = '\0'; if ((fields[cfield++] = us_field_index(ofield)) == -1) { (void) fprintf(stderr, gettext("invalid type '%s' " "for -o option\n"), ofield); return (-1); } if (delim != NULL) ofield = delim + 1; } while (delim != NULL); fields[cfield] = USFIELD_LAST; /* Override output types (-t option) */ if (tfield != NULL) { types = 0; do { boolean_t found = B_FALSE; if ((delim = strchr(tfield, ',')) != NULL) *delim = '\0'; for (i = 0; i < sizeof (us_type_bits) / sizeof (int); i++) { if (strcmp(tfield, us_type_names[i]) == 0) { found = B_TRUE; types |= us_type_bits[i]; break; } } if (!found) { (void) fprintf(stderr, gettext("invalid type " "'%s' for -t option\n"), tfield); return (-1); } if (delim != NULL) tfield = delim + 1; } while (delim != NULL); } if ((zhp = zfs_path_to_zhandle(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) return (1); if (zfs_get_underlying_type(zhp) != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("operation is only applicable " "to filesystems and 
their snapshots\n")); zfs_close(zhp); return (1); } if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) nomem(); if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) nomem(); /* Always add default sorting columns */ (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); cb.cb_sortcol = sortcol; cb.cb_numname = prtnum; cb.cb_nicenum = !parsable; cb.cb_avl_pool = avl_pool; cb.cb_avl = avl_tree; cb.cb_sid2posix = sid2posix; for (i = 0; i < USFIELD_LAST; i++) cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { if ((zfs_prop_is_user(p) && !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || (zfs_prop_is_group(p) && !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) || (zfs_prop_is_project(p) && types != USTYPE_PROJ)) continue; cb.cb_prop = p; if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) { zfs_close(zhp); return (ret); } } zfs_close(zhp); /* Sort the list */ if ((node = uu_avl_first(avl_tree)) == NULL) return (0); us_populated = B_TRUE; listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); list = uu_list_create(listpool, NULL, UU_DEFAULT); uu_list_node_init(node, &node->usn_listnode, listpool); while (node != NULL) { rmnode = node; node = uu_avl_next(avl_tree, node); uu_avl_remove(avl_tree, rmnode); if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) uu_list_insert(list, rmnode, idx2); } for (node = uu_list_first(list); node != NULL; node = uu_list_next(list, node)) { us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) uu_avl_insert(avl_tree, node, idx); } uu_list_destroy(list); uu_list_pool_destroy(listpool); /* Print and free node nvlist memory */ print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, cb.cb_avl); zfs_free_sort_columns(sortcol); /* Clean up the AVL tree */ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(cb.cb_avl, node); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(avl_tree); uu_avl_pool_destroy(avl_pool); return (ret); } /* * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] * [-t type[,...]] [filesystem|volume|snapshot] ... * * -H Scripted mode; elide headers and separate columns by tabs * -p Display values in parsable (literal) format. * -r Recurse over all children * -d Limit recursion by depth. * -o Control which fields to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * When given no arguments, list all filesystems in the system. * Otherwise, list the specified datasets, optionally recursing down them if * '-r' is specified. */ typedef struct list_cbdata { boolean_t cb_first; boolean_t cb_literal; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_json; nvlist_t *cb_jsobj; boolean_t cb_json_as_int; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. 
*/ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZFS_MAXPROPLEN]; const char *header; int i; boolean_t first = B_TRUE; boolean_t right_justify; color_start(ANSI_BOLD); for (; pl != NULL; pl = pl->pl_next) { if (!first) { (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { header = zfs_prop_column_name(pl->pl_prop); right_justify = zfs_prop_align_right(pl->pl_prop); } else { for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, header); else (void) printf("%-*s", (int)pl->pl_width, header); } color_end(); (void) printf("\n"); } /* * Decides on the color that the avail value should be printed in. * > 80% used = yellow * > 90% used = red */ static const char * zfs_list_avail_color(zfs_handle_t *zhp) { uint64_t used = zfs_prop_get_int(zhp, ZFS_PROP_USED); uint64_t avail = zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE); int percentage = (int)((double)avail / MAX(avail + used, 1) * 100); if (percentage > 20) return (NULL); else if (percentage > 10) return (ANSI_YELLOW); else return (ANSI_RED); } /* * Given a dataset and a list of fields, print out all the properties according * to the described layout, or return an nvlist containing all the fields, later * to be printed out as JSON object. */ static void collect_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZFS_MAXPROPLEN]; nvlist_t *userprops = zfs_get_user_props(zhp); nvlist_t *propval; const char *propstr; boolean_t right_justify; nvlist_t *item, *d, *props; item = d = props = NULL; zprop_source_t sourcetype = ZPROP_SRC_NONE; char source[ZFS_MAX_DATASET_NAME_LEN]; if (cb->cb_json) { d = fnvlist_lookup_nvlist(cb->cb_jsobj, "datasets"); if (d == NULL) { fprintf(stderr, "datasets obj not found.\n"); exit(1); } item = fnvlist_alloc(); props = fnvlist_alloc(); fill_dataset_info(item, zhp, cb->cb_json_as_int); } for (; pl != NULL; pl = pl->pl_next) { if (!cb->cb_json && !first) { if (cb->cb_scripted) (void) putchar('\t'); else (void) fputs(" ", stdout); } else { first = B_FALSE; } if (pl->pl_prop == ZFS_PROP_NAME) { (void) strlcpy(property, zfs_get_name(zhp), sizeof (property)); propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (pl->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), &sourcetype, source, sizeof (source), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (zfs_prop_userquota(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; propstr = "-"; } else { propstr = property; } right_justify = B_TRUE; } else if (zfs_prop_written(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_written(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; propstr = "-"; } else { propstr = property; } right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) { propstr = "-"; } else { propstr = fnvlist_lookup_string(propval, ZPROP_VALUE); strlcpy(source, fnvlist_lookup_string(propval, 
ZPROP_SOURCE), ZFS_MAX_DATASET_NAME_LEN); if (strcmp(source, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; } else if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; } } right_justify = B_FALSE; } if (cb->cb_json) { if (pl->pl_prop == ZFS_PROP_NAME) continue; const char *prop_name; if (pl->pl_prop != ZPROP_USERPROP) prop_name = zfs_prop_to_name(pl->pl_prop); else prop_name = pl->pl_user_prop; if (zprop_nvlist_one_property( prop_name, propstr, sourcetype, source, NULL, props, cb->cb_json_as_int) != 0) nomem(); } else { /* * zfs_list_avail_color() needs * ZFS_PROP_AVAILABLE + USED, so we need another * for() search for the USED part when no colors * wanted, we can skip the whole thing */ if (use_color() && pl->pl_prop == ZFS_PROP_AVAILABLE) { zprop_list_t *pl2 = cb->cb_proplist; for (; pl2 != NULL; pl2 = pl2->pl_next) { if (pl2->pl_prop == ZFS_PROP_USED) { color_start( zfs_list_avail_color(zhp)); /* * found it, no need for more * loops */ break; } } } /* * If this is being called in scripted mode, or if * this is the last column and it is left-justified, * don't include a width format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) fputs(propstr, stdout); else if (right_justify) { (void) printf("%*s", (int)pl->pl_width, propstr); } else { (void) printf("%-*s", (int)pl->pl_width, propstr); } if (pl->pl_prop == ZFS_PROP_AVAILABLE) color_end(); } } if (cb->cb_json) { fnvlist_add_nvlist(item, "properties", props); fnvlist_add_nvlist(d, zfs_get_name(zhp), item); fnvlist_free(props); fnvlist_free(item); } else (void) putchar('\n'); } /* * Generic callback function to list a dataset or snapshot. */ static int list_callback(zfs_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; if (cbp->cb_first) { if (!cbp->cb_scripted && !cbp->cb_json) print_header(cbp); cbp->cb_first = B_FALSE; } collect_dataset(zhp, cbp); return (0); } static int zfs_do_list(int argc, char **argv) { int c; char default_fields[] = "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; boolean_t types_specified = B_FALSE; char *fields = default_fields; list_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; nvlist_t *data = NULL; struct option long_options[] = { {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "jHS:d:o:prs:t:", long_options, NULL)) != -1) { switch (c) { case 'o': fields = optarg; break; case 'p': cb.cb_literal = B_TRUE; flags |= ZFS_ITER_LITERAL_PROPS; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'j': cb.cb_json = B_TRUE; cb.cb_jsobj = zfs_json_schema(0, 1); data = fnvlist_alloc(); fnvlist_add_nvlist(cb.cb_jsobj, "datasets", data); fnvlist_free(data); break; case ZFS_OPTION_JSON_NUMS_AS_INT: cb.cb_json_as_int = B_TRUE; cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 's': if (zfs_add_sort_column(&sortcol, optarg, B_FALSE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 'S': if (zfs_add_sort_column(&sortcol, optarg, B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 't': types = 0; types_specified = B_TRUE; flags &= ~ZFS_ITER_PROP_LISTSNAPS; for 
(char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_subopts[] = { "filesystem", "fs", "volume", "vol", "snapshot", "snap", "bookmark", "all" }; static const int type_types[] = { ZFS_TYPE_FILESYSTEM, ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, ZFS_TYPE_VOLUME, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; for (c = 0; c < ARRAY_SIZE(type_subopts); ++c) if (strcmp(tok, type_subopts[c]) == 0) { types |= type_types[c]; goto found3; } (void) fprintf(stderr, gettext("invalid type '%s'\n"), tok); usage(B_FALSE); found3:; } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!cb.cb_json && cb.cb_json_as_int) { (void) fprintf(stderr, gettext("'--json-int' only works with" " '-j' option\n")); usage(B_FALSE); } /* * If "-o space" and no types were specified, don't display snapshots. */ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) types &= ~ZFS_TYPE_SNAPSHOT; /* * Handle users who want to list all snapshots or bookmarks * of the current dataset (ex. 'zfs list -t snapshot <dataset>'). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } /* * If the user specifies '-o all', the zprop_get_list() doesn't * normally include the name of the dataset. For 'zfs list', we always * want this property to be first. */ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); cb.cb_first = B_TRUE; /* * If we are only going to list and sort by properties that are "fast" * then we can use "simple" mode and avoid populating the properties * nvlist. */ if (zfs_list_only_by_fast(cb.cb_proplist) && zfs_sort_only_by_fast(sortcol)) flags |= ZFS_ITER_SIMPLE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, limit, list_callback, &cb); if (ret == 0 && cb.cb_json) zcmd_print_json(cb.cb_jsobj); else if (ret != 0 && cb.cb_json) nvlist_free(cb.cb_jsobj); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); if (ret == 0 && cb.cb_first && !cb.cb_scripted) (void) fprintf(stderr, gettext("no datasets available\n")); return (ret); } /* * zfs rename [-fu] <fs|snap|vol> <fs|snap|vol> * zfs rename [-f] -p <fs|vol> <fs|vol> * zfs rename [-u] -r <snap> <snap> * * Renames the given dataset to another of the same type. * * The '-p' flag creates all the non-existing ancestors of the target first. * The '-u' flag prevents file systems from being remounted during rename.
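 *
 * Example invocations (illustrative; names are made up):
 *
 *	zfs rename tank/home tank/users
 *	zfs rename -p tank/a/b tank/x/y/b	(creates tank/x and tank/x/y)
 *	zfs rename -r tank/fs@monday tank/fs@tuesday	(renames the
 *	    snapshot throughout the tank/fs hierarchy)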
*/ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; renameflags_t flags = { 0 }; int c; int ret = 0; int types; boolean_t parents = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "pruf")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': flags.recursive = B_TRUE; break; case 'u': flags.nounmount = B_TRUE; break; case 'f': flags.forceunmount = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (flags.recursive && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.nounmount && parents) { (void) fprintf(stderr, gettext("-u and -p options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.recursive && strchr(argv[0], '@') == NULL) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } if (flags.nounmount) types = ZFS_TYPE_FILESYSTEM; else if (parents) types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; else types = ZFS_TYPE_DATASET; if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. */ if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) && zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); return (1); } ret = (zfs_rename(zhp, argv[1], flags) != 0); zfs_close(zhp); return (ret); } /* * zfs promote <fs> * * Promotes the given clone fs to be the parent */ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing clone filesystem" " argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); ret = (zfs_promote(zhp) != 0); zfs_close(zhp); return (ret); } static int zfs_do_redact(int argc, char **argv) { char *snap = NULL; char *bookname = NULL; char **rsnaps = NULL; int numrsnaps = 0; argv++; argc--; if (argc < 3) { (void) fprintf(stderr, gettext("too few arguments\n")); usage(B_FALSE); } snap = argv[0]; bookname = argv[1]; rsnaps = argv + 2; numrsnaps = argc - 2; nvlist_t *rsnapnv = fnvlist_alloc(); for (int i = 0; i < numrsnaps; i++) { fnvlist_add_boolean(rsnapnv, rsnaps[i]); } int err = lzc_redact(snap, bookname, rsnapnv); fnvlist_free(rsnapnv); switch (err) { case 0: break; case ENOENT: { zfs_handle_t *zhp = zfs_open(g_zfs, snap, ZFS_TYPE_SNAPSHOT); if (zhp == NULL) { (void) fprintf(stderr, gettext("provided snapshot %s " "does not exist\n"), snap); } else { zfs_close(zhp); } for (int i = 0; i < numrsnaps; i++) { zhp = zfs_open(g_zfs, rsnaps[i], ZFS_TYPE_SNAPSHOT); if (zhp == NULL) { (void) fprintf(stderr, gettext("provided " "snapshot %s does not exist\n"), rsnaps[i]); } else { zfs_close(zhp); } } break; } case EEXIST: (void) fprintf(stderr, gettext("specified redaction 
bookmark " "(%s) provided already exists\n"), bookname); break; case ENAMETOOLONG: (void) fprintf(stderr, gettext("provided bookmark name cannot " "be used, final name would be too long\n")); break; case E2BIG: (void) fprintf(stderr, gettext("too many redaction snapshots " "specified\n")); break; case EINVAL: if (strchr(bookname, '#') != NULL) (void) fprintf(stderr, gettext( "redaction bookmark name must not contain '#'\n")); else (void) fprintf(stderr, gettext( "redaction snapshot must be descendent of " "snapshot being redacted\n")); break; case EALREADY: (void) fprintf(stderr, gettext("attempted to redact redacted " "dataset or with respect to redacted dataset\n")); break; case ENOTSUP: (void) fprintf(stderr, gettext("redaction bookmarks feature " "not enabled\n")); break; case EXDEV: (void) fprintf(stderr, gettext("potentially invalid redaction " "snapshot; full dataset names required\n")); break; case ESRCH: (void) fprintf(stderr, gettext("attempted to resume redaction " " with a mismatched redaction list\n")); break; default: (void) fprintf(stderr, gettext("internal error: %s\n"), strerror(errno)); } return (err); } /* * zfs rollback [-rRf] * * -r Delete any intervening snapshots before doing rollback * -R Delete any snapshots and their clones * -f ignored for backwards compatibility * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, * the command will complain unless the '-r' flag is given. */ typedef struct rollback_cbdata { uint64_t cb_create; uint8_t cb_younger_ds_printed; boolean_t cb_first; int cb_doclones; char *cb_target; int cb_error; boolean_t cb_recurse; } rollback_cbdata_t; static int rollback_check_dependent(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; if (cbp->cb_first && cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot rollback to " "'%s': clones of previous snapshots exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-R' to " "force deletion of the following clones and " "dependents:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); zfs_close(zhp); return (0); } /* * Report some snapshots/bookmarks more recent than the one specified. * Used when '-r' is not specified. We reuse this same callback for the * snapshot dependents - if 'cb_dependent' is set, then this is a * dependent and we should report it without checking the transaction group. */ static int rollback_check(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; /* * Max number of younger snapshots and/or bookmarks to display before * we stop the iteration. 
*/ const uint8_t max_younger = 32; if (cbp->cb_doclones) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { if (cbp->cb_first && !cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot " "rollback to '%s': more recent snapshots " "or bookmarks exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-r' to " "force deletion of the following " "snapshots and bookmarks:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } if (cbp->cb_recurse) { if (zfs_iter_dependents_v2(zhp, 0, B_TRUE, rollback_check_dependent, cbp) != 0) { zfs_close(zhp); return (-1); } } else { (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); cbp->cb_younger_ds_printed++; } } zfs_close(zhp); if (cbp->cb_younger_ds_printed == max_younger) { /* * This non-recursive rollback is going to fail due to the * presence of snapshots and/or bookmarks that are younger than * the rollback target. * We printed some of the offending objects, now we stop * zfs_iter_snapshot/bookmark iteration so we can fail fast and * avoid iterating over the rest of the younger objects */ (void) fprintf(stderr, gettext("Output limited to %d " "snapshots/bookmarks\n"), max_younger); return (-1); } return (0); } static int zfs_do_rollback(int argc, char **argv) { int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; zfs_handle_t *zhp, *snap; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *delim; uint64_t min_txg = 0; /* check options */ while ((c = getopt(argc, argv, "rRf")) != -1) { switch (c) { case 'r': cb.cb_recurse = 1; break; case 'R': cb.cb_recurse = 1; cb.cb_doclones = 1; break; case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* open the snapshot */ if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) return (1); /* open the parent dataset */ (void) strlcpy(parentname, argv[0], sizeof (parentname)); verify((delim = strrchr(parentname, '@')) != NULL); *delim = '\0'; if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { zfs_close(snap); return (1); } /* * Check for more recent snapshots and/or clones based on the presence * of '-r' and '-R'. */ cb.cb_target = argv[0]; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); cb.cb_first = B_TRUE; cb.cb_error = 0; if (cb.cb_create > 0) min_txg = cb.cb_create; if ((ret = zfs_iter_snapshots_sorted_v2(zhp, 0, rollback_check, &cb, min_txg, 0)) != 0) goto out; if ((ret = zfs_iter_bookmarks_v2(zhp, 0, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) goto out; /* * Rollback parent to the given snapshot. */ ret = zfs_rollback(zhp, snap, force); out: zfs_close(snap); zfs_close(zhp); if (ret == 0) return (0); else return (1); } /* * zfs set property=value ... { fs | snap | vol } ... * * Sets the given properties for all datasets specified on the command line. 
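 *
 * Example invocations (illustrative):
 *
 *	zfs set compression=on atime=off tank/home tank/db
 *	zfs set -u mountpoint=/export/home tank/home	(set the property
 *	    without remounting now)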
*/ static int set_callback(zfs_handle_t *zhp, void *data) { zprop_set_cbdata_t *cb = data; int ret = zfs_prop_set_list_flags(zhp, cb->cb_proplist, cb->cb_flags); if (ret != 0 || libzfs_errno(g_zfs) != EZFS_SUCCESS) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to remount filesystem\n")); break; case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); break; } } return (ret); } static int zfs_do_set(int argc, char **argv) { zprop_set_cbdata_t cb = { 0 }; int ds_start = -1; /* argv idx of first dataset arg */ int ret = 0; int i, c; /* check options */ while ((c = getopt(argc, argv, "u")) != -1) { switch (c) { case 'u': cb.cb_flags |= ZFS_SET_NOMOUNT; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } if (argc < 2) { if (strchr(argv[0], '=') == NULL) { (void) fprintf(stderr, gettext("missing property=value " "argument(s)\n")); } else { (void) fprintf(stderr, gettext("missing dataset " "name(s)\n")); } usage(B_FALSE); } /* validate argument order: prop=val args followed by dataset args */ for (i = 0; i < argc; i++) { if (strchr(argv[i], '=') != NULL) { if (ds_start > 0) { /* out-of-order prop=val argument */ (void) fprintf(stderr, gettext("invalid " "argument order\n")); usage(B_FALSE); } } else if (ds_start < 0) { ds_start = i; } } if (ds_start < 0) { (void) fprintf(stderr, gettext("missing dataset name(s)\n")); usage(B_FALSE); } /* Populate a list of property settings */ if (nvlist_alloc(&cb.cb_proplist, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 0; i < ds_start; i++) { if (!parseprop(cb.cb_proplist, argv[i])) { ret = -1; goto error; } } ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb); error: nvlist_free(cb.cb_proplist); return (ret); } typedef struct snap_cbdata { nvlist_t *sd_nvl; boolean_t sd_recursive; const char *sd_snapname; } snap_cbdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snap_cbdata_t *sd = arg; char *name; int rv = 0; int error; if (sd->sd_recursive && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { zfs_close(zhp); return (0); } error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); if (error == -1) nomem(); fnvlist_add_boolean(sd->sd_nvl, name); free(name); if (sd->sd_recursive) rv = zfs_iter_filesystems_v2(zhp, 0, zfs_snapshot_cb, sd); zfs_close(zhp); return (rv); } /* * zfs snapshot [-r] [-o prop=value] ... <fs@snap> * * Creates a snapshot with the given name. While functionally equivalent to * 'zfs create', it is a separate command to differentiate intent.
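 *
 * Example invocations (illustrative; the user property is made up):
 *
 *	zfs snapshot tank/home@friday
 *	zfs snapshot -r -o com.example:note=pre-upgrade tank@pre-upgrade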
*/ static int zfs_do_snapshot(int argc, char **argv) { int ret = 0; int c; nvlist_t *props; snap_cbdata_t sd = { 0 }; boolean_t multiple_snaps = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(sd.sd_nvl); nvlist_free(props); return (1); } break; case 'r': sd.sd_recursive = B_TRUE; multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } if (argc > 1) multiple_snaps = B_TRUE; for (; argc > 0; argc--, argv++) { char *atp; zfs_handle_t *zhp; atp = strchr(argv[0], '@'); if (atp == NULL) goto usage; *atp = '\0'; sd.sd_snapname = atp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) goto usage; if (zfs_snapshot_cb(zhp, &sd) != 0) goto usage; } ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); nvlist_free(props); if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Array of prefixes to exclude – * a linear search, even if executed for each dataset, * is plenty good enough. */ typedef struct zfs_send_exclude_arg { size_t count; const char **list; } zfs_send_exclude_arg_t; static boolean_t zfs_do_send_exclude(zfs_handle_t *zhp, void *context) { zfs_send_exclude_arg_t *excludes = context; const char *name = zfs_get_name(zhp); for (size_t i = 0; i < excludes->count; ++i) { size_t len = strlen(excludes->list[i]); if (strncmp(name, excludes->list[i], len) == 0 && memchr("/@", name[len], sizeof ("/@"))) return (B_FALSE); } return (B_TRUE); } /* * Send a backup stream to stdout. 
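 *
 * Example invocations (illustrative; names are made up):
 *
 *	zfs send tank/fs@b | ssh backup zfs receive pool/fs
 *	zfs send -i @a tank/fs@b > ab.incr
 *	zfs send -R -X tank/fs/scratch tank/fs@b	(replication stream
 *	    that omits tank/fs/scratch and its descendants)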
*/ static int zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; char *redactbook = NULL; zfs_send_exclude_arg_t excludes = { 0 }; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, {"skip-missing", no_argument, NULL, 's'}, {"redact", required_argument, NULL, 'd'}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, {"proctitle", no_argument, NULL, 'V'}, {"verbose", no_argument, NULL, 'v'}, {"dryrun", no_argument, NULL, 'n'}, {"large-block", no_argument, NULL, 'L'}, {"embed", no_argument, NULL, 'e'}, {"resume", required_argument, NULL, 't'}, {"compressed", no_argument, NULL, 'c'}, {"raw", no_argument, NULL, 'w'}, {"backup", no_argument, NULL, 'b'}, {"holds", no_argument, NULL, 'h'}, {"saved", no_argument, NULL, 'S'}, {"exclude", required_argument, NULL, 'X'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":i:I:RsDpVvnPLeht:cwbd:SX:", long_options, NULL)) != -1) { switch (c) { case 'X': for (char *ds; (ds = strsep(&optarg, ",")) != NULL; ) { if (!zfs_name_valid(ds, ZFS_TYPE_DATASET) || strchr(ds, '/') == NULL) { (void) fprintf(stderr, gettext("-X %s: " "not a valid non-root dataset name" ".\n"), ds); usage(B_FALSE); } excludes.list = safe_realloc(excludes.list, sizeof (char *) * (excludes.count + 1)); excludes.list[excludes.count++] = ds; } break; case 'i': if (fromname) usage(B_FALSE); fromname = optarg; break; case 'I': if (fromname) usage(B_FALSE); fromname = optarg; flags.doall = B_TRUE; break; case 'R': flags.replicate = B_TRUE; break; case 's': flags.skipmissing = B_TRUE; break; case 'd': redactbook = optarg; break; case 'p': flags.props = B_TRUE; break; case 'b': flags.backup = B_TRUE; break; case 'h': flags.holds = B_TRUE; break; case 'P': flags.parsable = B_TRUE; break; case 'V': flags.progressastitle = B_TRUE; break; case 'v': flags.verbosity++; flags.progress = B_TRUE; break; case 'D': (void) fprintf(stderr, gettext("WARNING: deduplicated send is no " "longer supported. A regular,\n" "non-deduplicated stream will be generated.\n\n")); break; case 'n': flags.dryrun = B_TRUE; break; case 'L': flags.largeblock = B_TRUE; break; case 'e': flags.embed_data = B_TRUE; break; case 't': resume_token = optarg; break; case 'c': flags.compress = B_TRUE; break; case 'w': flags.raw = B_TRUE; flags.compress = B_TRUE; flags.embed_data = B_TRUE; flags.largeblock = B_TRUE; break; case 'S': flags.saved = B_TRUE; break; case ':': /* * If a parameter was not passed, optopt contains the * value that would normally lead us into the * appropriate case statement. If it's > 256, then this * must be a longopt and we should look at argv to get * the string. Otherwise it's just the character, so we * should use it directly. */ if (optopt <= UINT8_MAX) { (void) fprintf(stderr, gettext("missing argument for '%c' " "option\n"), optopt); } else { (void) fprintf(stderr, gettext("missing argument for '%s' " "option\n"), argv[optind - 1]); } free(excludes.list); usage(B_FALSE); break; case '?': default: /* * If an invalid flag was passed, optopt contains the * character if it was a short flag, or 0 if it was a * longopt. 
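 * For example (illustrative), 'zfs send -q ...' reports the invalid
 * option via optopt == 'q', while an unknown long option such as
 * '--frobnicate' (made up) leaves optopt at 0 and the message is built
 * from argv[optind - 1] instead.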
*/ if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } free(excludes.list); usage(B_FALSE); } } if ((flags.parsable || flags.progressastitle) && flags.verbosity == 0) flags.verbosity = 1; if (excludes.count > 0 && !flags.replicate) { free(excludes.list); (void) fprintf(stderr, gettext("Cannot specify " "dataset exclusion (-X) on a non-recursive " "send.\n")); return (1); } argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || flags.backup || flags.holds || flags.saved || redactbook != NULL) { free(excludes.list); (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } if (argc > 0) { free(excludes.list); (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc < 1) { free(excludes.list); (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { free(excludes.list); (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } if (flags.saved) { if (fromname != NULL || flags.replicate || flags.props || flags.doall || flags.backup || flags.holds || flags.largeblock || flags.embed_data || flags.compress || flags.raw || redactbook != NULL) { free(excludes.list); (void) fprintf(stderr, gettext("incompatible flags " "combined with saved send flag\n")); usage(B_FALSE); } if (strchr(argv[0], '@') != NULL) { free(excludes.list); (void) fprintf(stderr, gettext("saved send must " "specify the dataset with partially-received " "state\n")); usage(B_FALSE); } } if (flags.raw && redactbook != NULL) { free(excludes.list); (void) fprintf(stderr, gettext("Error: raw sends may not be redacted.\n")); return (1); } if (!flags.dryrun && isatty(STDOUT_FILENO)) { free(excludes.list); (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); return (1); } if (flags.saved) { zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) { free(excludes.list); return (1); } err = zfs_send_saved(zhp, &flags, STDOUT_FILENO, resume_token); free(excludes.list); zfs_close(zhp); return (err != 0); } else if (resume_token != NULL) { free(excludes.list); return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, resume_token)); } if (flags.skipmissing && !flags.replicate) { free(excludes.list); (void) fprintf(stderr, gettext("skip-missing flag can only be used in " "conjunction with replicate\n")); usage(B_FALSE); } /* * For everything except -R and -I, use the new, cleaner code path. */ if (!(flags.replicate || flags.doall)) { char frombuf[ZFS_MAX_DATASET_NAME_LEN]; if (fromname != NULL && (strchr(fromname, '#') == NULL && strchr(fromname, '@') == NULL)) { /* * Neither bookmark or snapshot was specified. Print a * warning, and assume snapshot. */ (void) fprintf(stderr, "Warning: incremental source " "didn't specify type, assuming snapshot. Use '@' " "or '#' prefix to avoid ambiguity.\n"); (void) snprintf(frombuf, sizeof (frombuf), "@%s", fromname); fromname = frombuf; } if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. 
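 * For example (illustrative), 'zfs send -i @a tank/fs@b' expands the
 * incremental source to 'tank/fs@a' before the send is issued.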
*/ char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); fromname = frombuf; } zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) { free(excludes.list); return (1); } err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, redactbook); free(excludes.list); zfs_close(zhp); return (err != 0); } if (fromname != NULL && strchr(fromname, '#')) { (void) fprintf(stderr, gettext("Error: multiple snapshots cannot be " "sent from a bookmark.\n")); free(excludes.list); return (1); } if (redactbook != NULL) { (void) fprintf(stderr, gettext("Error: multiple snapshots " "cannot be sent redacted.\n")); free(excludes.list); return (1); } if ((cp = strchr(argv[0], '@')) == NULL) { (void) fprintf(stderr, gettext("Error: " "Unsupported flag with filesystem or bookmark.\n")); free(excludes.list); return (1); } *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { free(excludes.list); return (1); } /* * If they specified the full path to the snapshot, chop off * everything except the short name of the snapshot, but special * case if they specify the origin. */ if (fromname && (cp = strchr(fromname, '@')) != NULL) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE); if (strcmp(origin, fromname) == 0) { fromname = NULL; flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { zfs_close(zhp); free(excludes.list); (void) fprintf(stderr, gettext("incremental source must be " "in same filesystem\n")); usage(B_FALSE); } fromname = cp + 1; if (strchr(fromname, '@') || strchr(fromname, '/')) { zfs_close(zhp); free(excludes.list); (void) fprintf(stderr, gettext("invalid incremental source\n")); usage(B_FALSE); } } } if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, excludes.count > 0 ? zfs_do_send_exclude : NULL, &excludes, flags.verbosity >= 3 ? &dbgnv : NULL); if (flags.verbosity >= 3 && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. Make it print to stderr * instead. */ (void) dup2(STDERR_FILENO, STDOUT_FILENO); dump_nvlist(dbgnv, 0); nvlist_free(dbgnv); } zfs_close(zhp); free(excludes.list); return (err != 0); } /* * Restore a backup stream from stdin. 
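* Typical usage (illustrative): 'zfs send tank/fs@snap | zfs receive backup/fs', or with -s so that an interrupted stream can later be resumed by feeding the receive_resume_token to 'zfs send -t'.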
*/ static int zfs_do_receive(int argc, char **argv) { int c, err = 0; recvflags_t flags = { 0 }; boolean_t abort_resumable = B_FALSE; nvlist_t *props; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":o:x:dehMnuvFsAc")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'x': if (!parsepropname(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'd': if (flags.istail) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.isprefix = B_TRUE; break; case 'e': if (flags.isprefix) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.istail = B_TRUE; break; case 'h': flags.skipholds = B_TRUE; break; case 'M': flags.forceunmount = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'u': flags.nomount = B_TRUE; break; case 'v': flags.verbose = B_TRUE; break; case 's': flags.resumable = B_TRUE; break; case 'F': flags.force = B_TRUE; break; case 'A': abort_resumable = B_TRUE; break; case 'c': flags.heal = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */ if (flags.istail) flags.isprefix = B_TRUE; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (abort_resumable) { if (flags.isprefix || flags.istail || flags.dryrun || flags.resumable || flags.nomount) { (void) fprintf(stderr, gettext("invalid option\n")); usage(B_FALSE); } char namebuf[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(namebuf, sizeof (namebuf), "%s/%%recv", argv[0]); if (zfs_dataset_exists(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_handle_t *zhp = zfs_open(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(props); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } else { zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { (void) fprintf(stderr, gettext("'%s' does not have any " "resumable receive state to abort\n"), argv[0]); nvlist_free(props); zfs_close(zhp); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } nvlist_free(props); return (err != 0); } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream cannot be read " "from a terminal.\n" "You must redirect standard input.\n")); nvlist_free(props); return (1); } err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); nvlist_free(props); return (err != 0); } /* * allow/unallow stuff */ /* copied from zfs/sys/dsl_deleg.h */ #define ZFS_DELEG_PERM_CREATE "create" #define ZFS_DELEG_PERM_DESTROY "destroy" #define ZFS_DELEG_PERM_SNAPSHOT "snapshot" #define ZFS_DELEG_PERM_ROLLBACK "rollback" #define ZFS_DELEG_PERM_CLONE "clone" #define ZFS_DELEG_PERM_PROMOTE "promote" #define ZFS_DELEG_PERM_RENAME "rename" #define
ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" #define ZFS_DELEG_PERM_SEND_RAW "send:raw" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_RECEIVE_APPEND "receive:append" #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ #define ZFS_DELEG_PERM_USERQUOTA "userquota" #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" #define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota" #define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota" #define ZFS_DELEG_PERM_USEROBJUSED "userobjused" #define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused" #define ZFS_DELEG_PERM_HOLD "hold" #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" #define ZFS_DELEG_PERM_PROJECTUSED "projectused" #define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota" #define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused" #define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota" #define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, { ZFS_DELEG_PERM_SEND_RAW, ZFS_DELEG_NOTE_SEND_RAW }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY }, { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY }, { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, { ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA }, { ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED }, { ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA }, { ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED }, { ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED }, { ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA }, { ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED }, { ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA }, { NULL, ZFS_DELEG_NOTE_NONE } }; /* permission structure */ typedef struct deleg_perm { zfs_deleg_who_type_t dp_who_type; const char *dp_name; boolean_t dp_local; boolean_t dp_descend; } deleg_perm_t; /* */ typedef struct deleg_perm_node { deleg_perm_t dpn_perm; uu_avl_node_t dpn_avl_node; } deleg_perm_node_t; typedef struct fs_perm fs_perm_t; /* permissions set */ typedef struct who_perm { zfs_deleg_who_type_t who_type; const char *who_name; /* id */ char who_ug_name[256]; /* user/group name 
*/ fs_perm_t *who_fsperm; /* uplink */ uu_avl_t *who_deleg_perm_avl; /* permissions */ } who_perm_t; /* */ typedef struct who_perm_node { who_perm_t who_perm; uu_avl_node_t who_avl_node; } who_perm_node_t; typedef struct fs_perm_set fs_perm_set_t; /* fs permissions */ struct fs_perm { const char *fsp_name; uu_avl_t *fsp_sc_avl; /* sets,create */ uu_avl_t *fsp_uge_avl; /* user,group,everyone */ fs_perm_set_t *fsp_set; /* uplink */ }; /* */ typedef struct fs_perm_node { fs_perm_t fspn_fsperm; uu_avl_t *fspn_avl; uu_list_node_t fspn_list_node; } fs_perm_node_t; /* top level structure */ struct fs_perm_set { uu_list_pool_t *fsps_list_pool; uu_list_t *fsps_list; /* list of fs_perms */ uu_avl_pool_t *fsps_named_set_avl_pool; uu_avl_pool_t *fsps_who_perm_avl_pool; uu_avl_pool_t *fsps_deleg_perm_avl_pool; }; static inline const char * deleg_perm_type(zfs_deleg_note_t note) { /* subcommands */ switch (note) { /* SUBCOMMANDS */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: case ZFS_DELEG_NOTE_GROUPUSED: case ZFS_DELEG_NOTE_USERPROP: case ZFS_DELEG_NOTE_USERQUOTA: case ZFS_DELEG_NOTE_USERUSED: case ZFS_DELEG_NOTE_USEROBJQUOTA: case ZFS_DELEG_NOTE_USEROBJUSED: case ZFS_DELEG_NOTE_GROUPOBJQUOTA: case ZFS_DELEG_NOTE_GROUPOBJUSED: case ZFS_DELEG_NOTE_PROJECTUSED: case ZFS_DELEG_NOTE_PROJECTQUOTA: case ZFS_DELEG_NOTE_PROJECTOBJUSED: case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: /* other */ return (gettext("other")); default: return (gettext("subcommand")); } } static int who_type2weight(zfs_deleg_who_type_t who_type) { int res; switch (who_type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: res = 0; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: res = 1; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: res = 2; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: res = 3; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: res = 4; break; default: res = -1; } return (res); } static int who_perm_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const who_perm_node_t *l = larg; const who_perm_node_t *r = rarg; zfs_deleg_who_type_t ltype = l->who_perm.who_type; zfs_deleg_who_type_t rtype = r->who_perm.who_type; int lweight = who_type2weight(ltype); int rweight = who_type2weight(rtype); int res = lweight - rweight; if (res == 0) res = strncmp(l->who_perm.who_name, r->who_perm.who_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static int deleg_perm_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const deleg_perm_node_t *l = larg; const deleg_perm_node_t *r = rarg; int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static inline void fs_perm_set_init(fs_perm_set_t *fspset) { memset(fspset, 0, sizeof (fs_perm_set_t)); if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if 
((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) == NULL) nomem(); } static inline void fs_perm_fini(fs_perm_t *); static inline void who_perm_fini(who_perm_t *); static inline void fs_perm_set_fini(fs_perm_set_t *fspset) { fs_perm_node_t *node = uu_list_first(fspset->fsps_list); while (node != NULL) { fs_perm_node_t *next_node = uu_list_next(fspset->fsps_list, node); fs_perm_t *fsperm = &node->fspn_fsperm; fs_perm_fini(fsperm); uu_list_remove(fspset->fsps_list, node); free(node); node = next_node; } uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); } static inline void deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, const char *name) { deleg_perm->dp_who_type = type; deleg_perm->dp_name = name; } static inline void who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, zfs_deleg_who_type_t type, const char *name) { uu_avl_pool_t *pool; pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; memset(who_perm, 0, sizeof (who_perm_t)); if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) nomem(); who_perm->who_type = type; who_perm->who_name = name; who_perm->who_fsperm = fsperm; } static inline void who_perm_fini(who_perm_t *who_perm) { deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); while (node != NULL) { deleg_perm_node_t *next_node = uu_avl_next(who_perm->who_deleg_perm_avl, node); uu_avl_remove(who_perm->who_deleg_perm_avl, node); free(node); node = next_node; } uu_avl_destroy(who_perm->who_deleg_perm_avl); } static inline void fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) { uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; memset(fsperm, 0, sizeof (fs_perm_t)); if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) == NULL) nomem(); fsperm->fsp_set = fspset; fsperm->fsp_name = fsname; } static inline void fs_perm_fini(fs_perm_t *fsperm) { who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_sc_avl, node); free(node); node = next_node; } node = uu_avl_first(fsperm->fsp_uge_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_uge_avl, node); free(node); node = next_node; } uu_avl_destroy(fsperm->fsp_sc_avl); uu_avl_destroy(fsperm->fsp_uge_avl); } static void set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, zfs_deleg_who_type_t who_type, const char *name, char locality) { uu_avl_index_t idx = 0; deleg_perm_node_t *found_node = NULL; deleg_perm_t *deleg_perm = &node->dpn_perm; deleg_perm_init(deleg_perm, who_type, name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) uu_avl_insert(avl, node, idx); else { node = found_node; deleg_perm = &node->dpn_perm; } switch (locality) { case ZFS_DELEG_LOCAL: deleg_perm->dp_local = B_TRUE; break; case ZFS_DELEG_DESCENDENT: deleg_perm->dp_descend = B_TRUE; break; case ZFS_DELEG_NA: break; default: 
assert(B_FALSE); /* invalid locality */ } } static inline int parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; uu_avl_t *avl = who_perm->who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_perm->who_type; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *name = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; deleg_perm_node_t *node = safe_malloc(sizeof (deleg_perm_node_t)); VERIFY(type == DATA_TYPE_BOOLEAN); uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); set_deleg_perm_node(avl, node, who_type, name, locality); } return (0); } static inline int parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = fsperm->fsp_set; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *name = nvpair_name(nvp); uu_avl_t *avl = NULL; uu_avl_pool_t *avl_pool = NULL; zfs_deleg_who_type_t perm_type = name[0]; char perm_locality = name[1]; const char *perm_name = name + 3; who_perm_t *who_perm = NULL; assert('$' == name[2]); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); switch (perm_type) { case ZFS_DELEG_CREATE: case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_NAMED_SET: case ZFS_DELEG_NAMED_SET_SETS: avl_pool = fspset->fsps_named_set_avl_pool; avl = fsperm->fsp_sc_avl; break; case ZFS_DELEG_USER: case ZFS_DELEG_USER_SETS: case ZFS_DELEG_GROUP: case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_EVERYONE: case ZFS_DELEG_EVERYONE_SETS: avl_pool = fspset->fsps_who_perm_avl_pool; avl = fsperm->fsp_uge_avl; break; default: assert(!"unhandled zfs_deleg_who_type_t"); } who_perm_node_t *found_node = NULL; who_perm_node_t *node = safe_malloc( sizeof (who_perm_node_t)); who_perm = &node->who_perm; uu_avl_index_t idx = 0; uu_avl_node_init(node, &node->who_avl_node, avl_pool); who_perm_init(who_perm, fsperm, perm_type, perm_name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) { if (avl == fsperm->fsp_uge_avl) { uid_t rid = 0; struct passwd *p = NULL; struct group *g = NULL; const char *nice_name = NULL; switch (perm_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: rid = atoi(perm_name); p = getpwuid(rid); if (p) nice_name = p->pw_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: rid = atoi(perm_name); g = getgrgid(rid); if (g) nice_name = g->gr_name; break; default: break; } if (nice_name != NULL) { (void) strlcpy( node->who_perm.who_ug_name, nice_name, 256); } else { /* User or group unknown */ (void) snprintf( node->who_perm.who_ug_name, sizeof (node->who_perm.who_ug_name), "(unknown: %d)", rid); } } uu_avl_insert(avl, node, idx); } else { node = found_node; who_perm = &node->who_perm; } assert(who_perm != NULL); (void) parse_who_perm(who_perm, nvl2, perm_locality); } return (0); } static inline int parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) { nvpair_t *nvp = NULL; uu_avl_index_t idx = 0; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *fsname = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); fs_perm_t *fsperm = NULL; fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); fsperm = &node->fspn_fsperm; VERIFY(DATA_TYPE_NVLIST == type); uu_list_node_init(node, &node->fspn_list_node, fspset->fsps_list_pool); idx = uu_list_numnodes(fspset->fsps_list); fs_perm_init(fsperm, fspset, fsname); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); (void) 
parse_fs_perm(fsperm, nvl2); uu_list_insert(fspset->fsps_list, node, idx); } return (0); } static inline const char * deleg_perm_comment(zfs_deleg_note_t note) { const char *str; /* subcommands */ switch (note) { /* SUBCOMMANDS */ case ZFS_DELEG_NOTE_ALLOW: str = gettext("Must also have the permission that is being" "\n\t\t\t\tallowed"); break; case ZFS_DELEG_NOTE_CLONE: str = gettext("Must also have the 'create' ability and 'mount'" "\n\t\t\t\tability in the origin file system"); break; case ZFS_DELEG_NOTE_CREATE: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DESTROY: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DIFF: str = gettext("Allows lookup of paths within a dataset;" "\n\t\t\t\tgiven an object number. Ordinary users need this" "\n\t\t\t\tin order to use zfs diff"); break; case ZFS_DELEG_NOTE_HOLD: str = gettext("Allows adding a user hold to a snapshot"); break; case ZFS_DELEG_NOTE_MOUNT: str = gettext("Allows mount/umount of ZFS datasets"); break; case ZFS_DELEG_NOTE_PROMOTE: str = gettext("Must also have the 'mount'\n\t\t\t\tand" " 'promote' ability in the origin file system"); break; case ZFS_DELEG_NOTE_RECEIVE: str = gettext("Must also have the 'mount' and 'create'" " ability"); break; case ZFS_DELEG_NOTE_RELEASE: str = gettext("Allows releasing a user hold which\n\t\t\t\t" "might destroy the snapshot"); break; case ZFS_DELEG_NOTE_RENAME: str = gettext("Must also have the 'mount' and 'create'" "\n\t\t\t\tability in the new parent"); break; case ZFS_DELEG_NOTE_ROLLBACK: str = gettext(""); break; case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; case ZFS_DELEG_NOTE_SEND_RAW: str = gettext("Allow sending ONLY encrypted (raw) replication" "\n\t\t\t\tstreams"); break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); break; case ZFS_DELEG_NOTE_SNAPSHOT: str = gettext(""); break; case ZFS_DELEG_NOTE_LOAD_KEY: str = gettext("Allows loading or unloading an encryption key"); break; case ZFS_DELEG_NOTE_CHANGE_KEY: str = gettext("Allows changing or adding an encryption key"); break; /* * case ZFS_DELEG_NOTE_VSCAN: * str = gettext(""); * break; */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: str = gettext("Allows accessing any groupquota@... property"); break; case ZFS_DELEG_NOTE_GROUPUSED: str = gettext("Allows reading any groupused@... property"); break; case ZFS_DELEG_NOTE_USERPROP: str = gettext("Allows changing any user property"); break; case ZFS_DELEG_NOTE_USERQUOTA: str = gettext("Allows accessing any userquota@... property"); break; case ZFS_DELEG_NOTE_USERUSED: str = gettext("Allows reading any userused@... property"); break; case ZFS_DELEG_NOTE_USEROBJQUOTA: str = gettext("Allows accessing any userobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "groupobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJUSED: str = gettext("Allows reading any groupobjused@... property"); break; case ZFS_DELEG_NOTE_USEROBJUSED: str = gettext("Allows reading any userobjused@... property"); break; case ZFS_DELEG_NOTE_PROJECTQUOTA: str = gettext("Allows accessing any projectquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTUSED: str = gettext("Allows reading any projectused@... 
property"); break; case ZFS_DELEG_NOTE_PROJECTOBJUSED: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjused@... property"); break; /* other */ default: str = ""; } return (str); } struct allow_opts { boolean_t local; boolean_t descend; boolean_t user; boolean_t group; boolean_t everyone; boolean_t create; boolean_t set; boolean_t recursive; /* unallow only */ boolean_t prt_usage; boolean_t prt_perms; char *who; char *perms; const char *dataset; }; static inline int prop_cmp(const void *a, const void *b) { const char *str1 = *(const char **)a; const char *str2 = *(const char **)b; return (strcmp(str1, str2)); } static void allow_usage(boolean_t un, boolean_t requested, const char *msg) { const char *opt_desc[] = { "-h", gettext("show this help message and exit"), "-l", gettext("set permission locally"), "-d", gettext("set permission for descents"), "-u", gettext("set permission for user"), "-g", gettext("set permission for group"), "-e", gettext("set permission for everyone"), "-c", gettext("set create time permission"), "-s", gettext("define permission set"), /* unallow only */ "-r", gettext("remove permissions recursively"), }; size_t unallow_size = sizeof (opt_desc) / sizeof (char *); size_t allow_size = unallow_size - 2; const char *props[ZFS_NUM_PROPS]; int i; size_t count = 0; FILE *fp = requested ? stdout : stderr; zprop_desc_t *pdtbl = zfs_prop_get_table(); const char *fmt = gettext("%-16s %-14s\t%s\n"); (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : HELP_ALLOW)); (void) fprintf(fp, gettext("Options:\n")); for (i = 0; i < (un ? unallow_size : allow_size); i += 2) { const char *opt = opt_desc[i]; const char *optdsc = opt_desc[i + 1]; (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); } (void) fprintf(fp, gettext("\nThe following permissions are " "supported:\n\n")); (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), gettext("NOTES")); for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; const char *perm_type = deleg_perm_type(perm_note); const char *perm_comment = deleg_perm_comment(perm_note); (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); } for (i = 0; i < ZFS_NUM_PROPS; i++) { zprop_desc_t *pd = &pdtbl[i]; if (pd->pd_visible != B_TRUE) continue; if (pd->pd_attr == PROP_READONLY) continue; props[count++] = pd->pd_name; } props[count] = NULL; qsort(props, count, sizeof (char *), prop_cmp); for (i = 0; i < count; i++) (void) fprintf(fp, fmt, props[i], gettext("property"), ""); if (msg != NULL) (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); exit(requested ? 0 : 2); } static inline const char * munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, char **permsp) { if (un && argc == expected_argc - 1) *permsp = NULL; else if (argc == expected_argc) *permsp = argv[argc - 2]; else allow_usage(un, B_FALSE, gettext("wrong number of parameters\n")); return (argv[argc - 1]); } static void parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) { int uge_sum = opts->user + opts->group + opts->everyone; int csuge_sum = opts->create + opts->set + uge_sum; int ldcsuge_sum = csuge_sum + opts->local + opts->descend; int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; if (uge_sum > 1) allow_usage(un, B_FALSE, gettext("-u, -g, and -e are mutually exclusive\n")); if (opts->prt_usage) { if (argc == 0 && all_sum == 0) allow_usage(un, B_TRUE, NULL); else usage(B_FALSE); } if (opts->set) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -s\n")); opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); if (argv[0][0] != '@') allow_usage(un, B_FALSE, gettext("invalid set name: missing '@' prefix\n")); opts->who = argv[0]; } else if (opts->create) { if (ldcsuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -c\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (opts->everyone) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -e\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") == 0) { opts->everyone = B_TRUE; argc--; argv++; opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (argc == 1 && !un) { opts->prt_perms = B_TRUE; opts->dataset = argv[argc-1]; } else { opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); opts->who = argv[0]; } if (!opts->local && !opts->descend) { opts->local = B_TRUE; opts->descend = B_TRUE; } } static void store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, const char *who, char *perms, nvlist_t *top_nvl) { int i; char ld[2] = { '\0', '\0' }; char who_buf[MAXNAMELEN + 32]; char base_type = '\0'; char set_type = '\0'; nvlist_t *base_nvl = NULL; nvlist_t *set_nvl = NULL; nvlist_t *nvl; if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); switch (type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: set_type = ZFS_DELEG_NAMED_SET_SETS; base_type = ZFS_DELEG_NAMED_SET; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: set_type = ZFS_DELEG_CREATE_SETS; base_type = ZFS_DELEG_CREATE; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: set_type = ZFS_DELEG_USER_SETS; base_type = ZFS_DELEG_USER; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: set_type = ZFS_DELEG_GROUP_SETS; base_type = ZFS_DELEG_GROUP; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: set_type = ZFS_DELEG_EVERYONE_SETS; base_type = ZFS_DELEG_EVERYONE; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; default: assert(set_type != '\0' && base_type != '\0'); } if (perms != NULL) { char *curr = perms; char *end = curr + strlen(perms); while (curr < end) { char *delim = strchr(curr, ','); if (delim == NULL) delim = end; else *delim = '\0'; if (curr[0] == '@') nvl = set_nvl; else nvl = base_nvl; (void) nvlist_add_boolean(nvl, curr); if (delim != end) *delim = ','; curr = delim + 1; } for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (!nvlist_empty(base_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, base_nvl); } if (!nvlist_empty(set_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, 
locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, set_nvl); } } } else { for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); } } } static int construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) { if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) nomem(); if (opts->set) { store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, opts->descend, opts->who, opts->perms, *nvlp); } else if (opts->create) { store_allow_perm(ZFS_DELEG_CREATE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else if (opts->everyone) { store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else { char *curr = opts->who; char *end = curr + strlen(curr); while (curr < end) { const char *who; zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; char *endch; char *delim = strchr(curr, ','); char errbuf[256]; char id[64]; struct passwd *p = NULL; struct group *g = NULL; uid_t rid; if (delim == NULL) delim = end; else *delim = '\0'; rid = (uid_t)strtol(curr, &endch, 0); if (opts->user) { who_type = ZFS_DELEG_USER; if (*endch != '\0') p = getpwnam(curr); else p = getpwuid(rid); if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { (void) snprintf(errbuf, sizeof (errbuf), gettext("invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { who_type = ZFS_DELEG_GROUP; if (*endch != '\0') g = getgrnam(curr); else g = getgrgid(rid); if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { (void) snprintf(errbuf, sizeof (errbuf), gettext("invalid group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else { if (*endch != '\0') { p = getpwnam(curr); } else { p = getpwuid(rid); } if (p == NULL) { if (*endch != '\0') { g = getgrnam(curr); } else { g = getgrgid(rid); } } if (p != NULL) { who_type = ZFS_DELEG_USER; rid = p->pw_uid; } else if (g != NULL) { who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { (void) snprintf(errbuf, sizeof (errbuf), gettext("invalid user/group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } (void) sprintf(id, "%u", rid); who = id; store_allow_perm(who_type, opts->local, opts->descend, who, opts->perms, *nvlp); curr = delim + 1; } } return (0); } static void print_set_creat_perms(uu_avl_t *who_avl) { const char *sc_title[] = { gettext("Permission sets:\n"), gettext("Create time permissions:\n"), NULL }; who_perm_node_t *who_node = NULL; int prev_weight = -1; for (who_node = uu_avl_first(who_avl); who_node != NULL; who_node = uu_avl_next(who_avl, who_node)) { uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; const char *who_name = who_node->who_perm.who_name; int weight = who_type2weight(who_type); boolean_t first = B_TRUE; deleg_perm_node_t *deleg_node; if (prev_weight != weight) { (void) printf("%s", sc_title[weight]); prev_weight = weight; } if (who_name == NULL || strnlen(who_name, 1) == 0) (void) printf("\t"); else (void) printf("\t%s ", who_name); for (deleg_node = 
uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (first) { (void) printf("%s", deleg_node->dpn_perm.dp_name); first = B_FALSE; } else (void) printf(",%s", deleg_node->dpn_perm.dp_name); } (void) printf("\n"); } } static void print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, const char *title) { who_perm_node_t *who_node = NULL; boolean_t prt_title = B_TRUE; uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((who_node = uu_avl_walk_next(walk)) != NULL) { const char *who_name = who_node->who_perm.who_name; const char *nice_who_name = who_node->who_perm.who_ug_name; uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; char delim = ' '; deleg_perm_node_t *deleg_node; boolean_t prt_who = B_TRUE; for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (local != deleg_node->dpn_perm.dp_local || descend != deleg_node->dpn_perm.dp_descend) continue; if (prt_who) { const char *who = NULL; if (prt_title) { prt_title = B_FALSE; (void) printf("%s", title); } switch (who_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: who = gettext("user"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: who = gettext("group"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: who = gettext("everyone"); who_name = NULL; break; default: assert(who != NULL); } prt_who = B_FALSE; if (who_name == NULL) (void) printf("\t%s", who); else (void) printf("\t%s %s", who, who_name); } (void) printf("%c%s", delim, deleg_node->dpn_perm.dp_name); delim = ','; } if (!prt_who) (void) printf("\n"); } uu_avl_walk_end(walk); } static void print_fs_perms(fs_perm_set_t *fspset) { fs_perm_node_t *node = NULL; char buf[MAXNAMELEN + 32]; const char *dsname = buf; for (node = uu_list_first(fspset->fsps_list); node != NULL; node = uu_list_next(fspset->fsps_list, node)) { uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; int left = 0; (void) snprintf(buf, sizeof (buf), gettext("---- Permissions on %s "), node->fspn_fsperm.fsp_name); (void) printf("%s", dsname); left = 70 - strlen(buf); while (left-- > 0) (void) printf("-"); (void) printf("\n"); print_set_creat_perms(sc_avl); print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, gettext("Local permissions:\n")); print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, gettext("Descendent permissions:\n")); print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, gettext("Local+Descendent permissions:\n")); } } static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; struct deleg_perms { boolean_t un; nvlist_t *nvl; }; static int set_deleg_perms(zfs_handle_t *zhp, void *data) { struct deleg_perms *perms = (struct deleg_perms *)data; zfs_type_t zfs_type = zfs_get_type(zhp); if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) return (0); return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); } static int zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) { zfs_handle_t *zhp; nvlist_t *perm_nvl = NULL; nvlist_t *update_perm_nvl = NULL; int error = 1; int c; struct allow_opts opts = { 0 }; const char *optstr = un ? 
"ldugecsrh" : "ldugecsh"; /* check opts */ while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'l': opts.local = B_TRUE; break; case 'd': opts.descend = B_TRUE; break; case 'u': opts.user = B_TRUE; break; case 'g': opts.group = B_TRUE; break; case 'e': opts.everyone = B_TRUE; break; case 's': opts.set = B_TRUE; break; case 'c': opts.create = B_TRUE; break; case 'r': opts.recursive = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'h': opts.prt_usage = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ parse_allow_args(argc, argv, un, &opts); /* try to open the dataset */ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { (void) fprintf(stderr, "Failed to open dataset: %s\n", opts.dataset); return (-1); } if (zfs_get_fsacl(zhp, &perm_nvl) != 0) goto cleanup2; fs_perm_set_init(&fs_perm_set); if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); goto cleanup1; } if (opts.prt_perms) print_fs_perms(&fs_perm_set); else { (void) construct_fsacl_list(un, &opts, &update_perm_nvl); if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) goto cleanup0; if (un && opts.recursive) { struct deleg_perms data = { un, update_perm_nvl }; if (zfs_iter_filesystems_v2(zhp, 0, set_deleg_perms, &data) != 0) goto cleanup0; } } error = 0; cleanup0: nvlist_free(perm_nvl); nvlist_free(update_perm_nvl); cleanup1: fs_perm_set_fini(&fs_perm_set); cleanup2: zfs_close(zhp); return (error); } static int zfs_do_allow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); } static int zfs_do_unallow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); } static int zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { int errors = 0; int i; const char *tag; boolean_t recursive = B_FALSE; const char *opts = holding ? "rt" : "r"; int c; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 2) usage(B_FALSE); tag = argv[0]; --argc; ++argv; if (holding && tag[0] == '.') { /* tags starting with '.' are reserved for libzfs */ (void) fprintf(stderr, gettext("tag may not start with '.'\n")); usage(B_FALSE); } for (i = 0; i < argc; ++i) { zfs_handle_t *zhp; char parent[ZFS_MAX_DATASET_NAME_LEN]; const char *delim; char *path = argv[i]; delim = strchr(path, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), path); ++errors; continue; } (void) strlcpy(parent, path, MIN(sizeof (parent), delim - path + 1)); zhp = zfs_open(g_zfs, parent, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { ++errors; continue; } if (holding) { if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) ++errors; } else { if (zfs_release(zhp, delim+1, tag, recursive) != 0) ++errors; } zfs_close(zhp); } return (errors != 0); } /* * zfs hold [-r] [-t] ... * * -r Recursively hold * * Apply a user-hold with the given tag to the list of snapshots. */ static int zfs_do_hold(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } /* * zfs release [-r] ... 
* * -r Recursively release * * Release a user-hold with the given tag from the list of snapshots. */ static int zfs_do_release(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct holds_cbdata { boolean_t cb_recursive; const char *cb_snapname; nvlist_t **cb_nvlp; size_t cb_max_namelen; size_t cb_max_taglen; } holds_cbdata_t; #define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" #define DATETIME_BUF_LEN (32) /* * Print a table of holds: one row per snapshot/tag pair, showing the * timestamp at which each hold was created. */ static void print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl, boolean_t parsable) { int i; nvpair_t *nvp = NULL; const char *const hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; const char *col; if (!scripted) { for (i = 0; i < 3; i++) { col = gettext(hdr_cols[i]); if (i < 2) (void) printf("%-*s ", i ? tagwidth : nwidth, col); else (void) printf("%s\n", col); } } while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *zname = nvpair_name(nvp); nvlist_t *nvl2; nvpair_t *nvp2 = NULL; (void) nvpair_value_nvlist(nvp, &nvl2); while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { char tsbuf[DATETIME_BUF_LEN]; const char *tagname = nvpair_name(nvp2); uint64_t val = 0; time_t time; struct tm t; (void) nvpair_value_uint64(nvp2, &val); time = (time_t)val; (void) localtime_r(&time, &t); (void) strftime(tsbuf, DATETIME_BUF_LEN, gettext(STRFTIME_FMT_STR), &t); if (scripted) { if (parsable) { (void) printf("%s\t%s\t%lld\n", zname, tagname, (long long)time); } else { (void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf); } } else { if (parsable) { (void) printf("%-*s %-*s %lld\n", nwidth, zname, tagwidth, tagname, (long long)time); } else { (void) printf("%-*s %-*s %s\n", nwidth, zname, tagwidth, tagname, tsbuf); } } } } } /* * Generic callback function to list a dataset or snapshot. */ static int holds_callback(zfs_handle_t *zhp, void *data) { holds_cbdata_t *cbp = data; nvlist_t *top_nvl = *cbp->cb_nvlp; nvlist_t *nvl = NULL; nvpair_t *nvp = NULL; const char *zname = zfs_get_name(zhp); size_t znamelen = strlen(zname); if (cbp->cb_recursive) { const char *snapname; char *delim = strchr(zname, '@'); if (delim == NULL) return (0); snapname = delim + 1; if (strcmp(cbp->cb_snapname, snapname)) return (0); } if (zfs_get_holds(zhp, &nvl) != 0) return (-1); if (znamelen > cbp->cb_max_namelen) cbp->cb_max_namelen = znamelen; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *tag = nvpair_name(nvp); size_t taglen = strlen(tag); if (taglen > cbp->cb_max_taglen) cbp->cb_max_taglen = taglen; } return (nvlist_add_nvlist(top_nvl, zname, nvl)); } /* * zfs holds [-rHp] ... * * -r Lists holds that are set on the named snapshots recursively. * -H Scripted mode; elide headers and separate columns by tabs. * -p Display values in parsable (literal) format.
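* Example (illustrative): 'zfs holds -rH tank@nightly' walks every dataset under 'tank' and emits tab-separated NAME/TAG/TIMESTAMP rows for each hold on a snapshot named 'nightly'.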
*/ static int zfs_do_holds(int argc, char **argv) { int c; boolean_t errors = B_FALSE; boolean_t scripted = B_FALSE; boolean_t recursive = B_FALSE; boolean_t parsable = B_FALSE; int types = ZFS_TYPE_SNAPSHOT; holds_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "rHp")) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': parsable = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (recursive) { types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; flags |= ZFS_ITER_RECURSE; } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) usage(B_FALSE); nvlist_t *nvl = fnvlist_alloc(); for (int i = 0; i < argc; ++i) { char *snapshot = argv[i]; const char *delim; const char *snapname; delim = strchr(snapshot, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), snapshot); errors = B_TRUE; continue; } snapname = delim + 1; if (recursive) snapshot[delim - snapshot] = '\0'; cb.cb_recursive = recursive; cb.cb_snapname = snapname; cb.cb_nvlp = &nvl; /* * 1. collect holds data, set format options */ ret = zfs_for_each(1, argv + i, flags, types, NULL, NULL, limit, holds_callback, &cb); if (ret != 0) errors = B_TRUE; } /* * 2. print holds data */ print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl, parsable); if (nvlist_empty(nvl)) (void) fprintf(stderr, gettext("no datasets available\n")); nvlist_free(nvl); return (errors); } #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { char **ga_datasets; int ga_count; boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; static int get_one_dataset(zfs_handle_t *zhp, void *data) { static const char *const spin[] = { "-", "\\", "|", "/" }; static int spinval = 0; static int spincheck = 0; static time_t last_spin_time = (time_t)0; get_all_state_t *state = data; zfs_type_t type = zfs_get_type(zhp); if (state->ga_verbose) { if (--spincheck < 0) { time_t now = time(NULL); if (last_spin_time + SPINNER_TIME < now) { update_progress(spin[spinval++ % 4]); last_spin_time = now; } spincheck = CHECK_SPINNER; } } /* * Iterate over any nested datasets. */ if (zfs_iter_filesystems_v2(zhp, 0, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } /* * Skip any datasets whose type does not match. */ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } libzfs_add_handle(state->ga_cbp, zhp); assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); return (0); } static int get_recursive_datasets(zfs_handle_t *zhp, void *data) { get_all_state_t *state = data; int len = strlen(zfs_get_name(zhp)); for (int i = 0; i < state->ga_count; ++i) { if (strcmp(state->ga_datasets[i], zfs_get_name(zhp)) == 0) return (get_one_dataset(zhp, data)); else if ((strncmp(state->ga_datasets[i], zfs_get_name(zhp), len) == 0) && state->ga_datasets[i][len] == '/') { (void) zfs_iter_filesystems_v2(zhp, 0, get_recursive_datasets, data); } } zfs_close(zhp); return (0); } static void get_all_datasets(get_all_state_t *state) { if (state->ga_verbose) set_progress_header(gettext("Reading ZFS config")); if (state->ga_datasets == NULL) (void) zfs_iter_root(g_zfs, get_one_dataset, state); else (void) zfs_iter_root(g_zfs, get_recursive_datasets, state); if (state->ga_verbose) finish_progress(gettext("done.")); } /* * Generic callback for sharing or mounting filesystems. 
Because the code is so * similar, we have a common function with an extra parameter to determine which * mode we are using. */ typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; typedef struct share_mount_state { share_mount_op_t sm_op; boolean_t sm_verbose; int sm_flags; char *sm_options; enum sa_protocol sm_proto; /* only valid for OP_SHARE */ pthread_mutex_t sm_lock; /* protects the remaining fields */ uint_t sm_total; /* number of filesystems to process */ uint_t sm_done; /* number of filesystems processed */ int sm_status; /* -1 if any of the share/mount operations failed */ } share_mount_state_t; /* * Share or mount a dataset. */ static int share_mount_one(zfs_handle_t *zhp, int op, int flags, enum sa_protocol protocol, boolean_t explicit, const char *options) { char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char smbshareopts[ZFS_MAXPROPLEN]; const char *cmdname = op == OP_SHARE ? "share" : "mount"; struct mnttab mnt; uint64_t zoned, canmount; boolean_t shared_nfs, shared_smb; assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); /* * Check to make sure we can mount/share this dataset. If we * are in the global zone and the filesystem is exported to a * local zone, or if we are in a local zone and the * filesystem is not exported, then it is an error. */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned && getzoneid() == GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "dataset is exported to a local zone\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "permission denied\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * Ignore any filesystems which don't apply to us. This * includes those with a legacy mountpoint, or those with * legacy share options. */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use exports(5) or " "smb.conf(5) to share this filesystem, or set " "the sharenfs or sharesmb property\n")); return (1); } /* * We cannot share or mount legacy filesystems. If the * shareopts is non-legacy but the mountpoint is legacy, we * treat it as a legacy share. 
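* For example (illustrative properties): a dataset with sharenfs=on but mountpoint=legacy is rejected below with "legacy mountpoint" rather than being shared, since the mountpoint is managed outside of ZFS.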
*/ if (strcmp(mountpoint, "legacy") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use %s(8) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } if (strcmp(mountpoint, "none") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': no " "mountpoint set\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * canmount explicit outcome * on no pass through * on yes pass through * off no return 0 * off yes display error, return 1 * noauto no return 0 * noauto yes pass through */ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "'canmount' property is set to 'off'\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { /* * When performing a 'zfs mount -a', we skip any mounts for * datasets that have 'noauto' set. Sharing a dataset with * 'noauto' set is only allowed if it's mounted. */ if (op == OP_MOUNT) return (0); if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { /* also purge it from existing exports */ zfs_unshare(zhp, mountpoint, NULL); return (0); } } /* * If this filesystem is encrypted and does not have * a loaded key, we can not mount it. */ if ((flags & MS_CRYPT) == 0 && zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "encryption key not loaded\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * If this filesystem is inconsistent and has a receive resume * token, we can not mount it. */ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Contains partially-completed state from " "\"zfs receive -s\", which can be resumed with " "\"zfs send -t\"\n"), cmdname, zfs_get_name(zhp)); return (1); } if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Dataset is not complete, was created by receiving " "a redacted zfs send stream.\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the * filesystem is already mounted or shared, return (failing * for explicit requests); otherwise mount or share the * filesystem. */ switch (op) { case OP_SHARE: { enum sa_protocol prot[] = {SA_PROTOCOL_NFS, SA_NO_PROTOCOL}; shared_nfs = zfs_is_shared(zhp, NULL, prot); *prot = SA_PROTOCOL_SMB; shared_smb = zfs_is_shared(zhp, NULL, prot); if ((shared_nfs && shared_smb) || (shared_nfs && strcmp(shareopts, "on") == 0 && strcmp(smbshareopts, "off") == 0) || (shared_smb && strcmp(smbshareopts, "on") == 0 && strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share " "'%s': filesystem already shared\n"), zfs_get_name(zhp)); return (1); } if (!zfs_is_mounted(zhp, NULL) && zfs_mount(zhp, NULL, flags) != 0) return (1); *prot = protocol; if (zfs_share(zhp, protocol == SA_NO_PROTOCOL ? 
NULL : prot)) return (1); } break; case OP_MOUNT: mnt.mnt_mntopts = (char *)(options ?: ""); if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && zfs_is_mounted(zhp, NULL)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot mount " "'%s': filesystem already mounted\n"), zfs_get_name(zhp)); return (1); } if (zfs_mount(zhp, options, flags) != 0) return (1); break; } return (0); } /* * Reports progress in the form "(current/total)". Not thread-safe. */ static void report_mount_progress(int current, int total) { static time_t last_progress_time = 0; time_t now = time(NULL); char info[32]; /* display header if we're here for the first time */ if (current == 1) { set_progress_header(gettext("Mounting ZFS filesystems")); } else if (current != total && last_progress_time + MOUNT_TIME >= now) { /* too soon to report again */ return; } last_progress_time = now; (void) sprintf(info, "(%d/%d)", current, total); if (current == total) finish_progress(info); else update_progress(info); } /* * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and * updates the progress meter. */ static int share_mount_one_cb(zfs_handle_t *zhp, void *arg) { share_mount_state_t *sms = arg; int ret; ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, B_FALSE, sms->sm_options); pthread_mutex_lock(&sms->sm_lock); if (ret != 0) sms->sm_status = ret; sms->sm_done++; if (sms->sm_verbose) report_mount_progress(sms->sm_done, sms->sm_total); pthread_mutex_unlock(&sms->sm_lock); return (ret); } static void append_options(char *mntopts, char *newopts) { int len = strlen(mntopts); /* original length plus new string to append plus 1 for the comma */ if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { (void) fprintf(stderr, gettext("the opts argument for " "'%s' option is too long (more than %d chars)\n"), "-o", MNT_LINE_MAX); usage(B_FALSE); } if (*mntopts) mntopts[len++] = ','; (void) strcpy(&mntopts[len], newopts); } static enum sa_protocol sa_protocol_decode(const char *protocol) { for (enum sa_protocol i = 0; i < ARRAY_SIZE(sa_protocol_names); ++i) if (strcmp(protocol, sa_protocol_names[i]) == 0) return (i); (void) fputs(gettext("share type must be one of: "), stderr); for (enum sa_protocol i = 0; i < ARRAY_SIZE(sa_protocol_names); ++i) (void) fprintf(stderr, "%s%s", i != 0 ? ", " : "", sa_protocol_names[i]); (void) fputc('\n', stderr); usage(B_FALSE); } static int share_mount(int op, int argc, char **argv) { int do_all = 0; int recursive = 0; boolean_t verbose = B_FALSE; boolean_t json = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; nvlist_t *jsobj, *data, *item; const uint_t mount_nthr = 512; uint_t nthr; jsobj = data = item = NULL; struct option long_options[] = { {"json", no_argument, NULL, 'j'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, op == OP_MOUNT ? ":ajRlvo:Of" : "al", op == OP_MOUNT ? 
long_options : NULL, NULL)) != -1) { switch (c) { case 'a': do_all = 1; break; case 'R': recursive = 1; break; case 'v': verbose = B_TRUE; break; case 'l': flags |= MS_CRYPT; break; case 'j': json = B_TRUE; jsobj = zfs_json_schema(0, 1); data = fnvlist_alloc(); break; case 'o': if (*optarg == '\0') { (void) fprintf(stderr, gettext("empty mount " "options (-o) specified\n")); usage(B_FALSE); } if (options == NULL) options = safe_malloc(MNT_LINE_MAX + 1); /* option validation is done later */ append_options(options, optarg); break; case 'O': flags |= MS_OVERLAY; break; case 'f': flags |= MS_FORCE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (json && argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* check number of arguments */ if (do_all || recursive) { enum sa_protocol protocol = SA_NO_PROTOCOL; if (op == OP_SHARE && argc > 0) { protocol = sa_protocol_decode(argv[0]); argc--; argv++; } if (argc != 0 && do_all) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (argc == 0 && recursive) { (void) fprintf(stderr, gettext("no dataset provided\n")); usage(B_FALSE); } start_progress_timer(); get_all_cb_t cb = { 0 }; get_all_state_t state = { 0 }; if (argc == 0) { state.ga_datasets = NULL; state.ga_count = -1; } else { zfs_handle_t *zhp; for (int i = 0; i < argc; i++) { zhp = zfs_open(g_zfs, argv[i], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) usage(B_FALSE); zfs_close(zhp); } state.ga_datasets = argv; state.ga_count = argc; } state.ga_verbose = verbose; state.ga_cbp = &cb; get_all_datasets(&state); if (cb.cb_used == 0) { free(options); return (0); } share_mount_state_t share_mount_state = { 0 }; share_mount_state.sm_op = op; share_mount_state.sm_verbose = verbose; share_mount_state.sm_flags = flags; share_mount_state.sm_options = options; share_mount_state.sm_proto = protocol; share_mount_state.sm_total = cb.cb_used; pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* For a 'zfs share -a' operation start with a clean slate. */ if (op == OP_SHARE) zfs_truncate_shares(NULL); /* * libshare isn't mt-safe, so only do the operation in parallel * if we're mounting. Additionally, the key-loading option must * be serialized so that we can prompt the user for their keys * in a consistent manner. */ nthr = op == OP_MOUNT && !(flags & MS_CRYPT) ? mount_nthr : 1; zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, share_mount_one_cb, &share_mount_state, nthr); zfs_commit_shares(NULL); ret = share_mount_state.sm_status; for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); } else if (argc == 0) { FILE *mnttab; struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { (void) fprintf(stderr, gettext("missing filesystem " "argument (specify -a for all)\n")); usage(B_FALSE); } /* * When mount is given no arguments, go through * /proc/self/mounts and display any active ZFS mounts. * We hide any snapshots, since they are controlled * automatically. 
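* Snapshot mounts are recognized by the '@' in the source field of the mount entry (e.g. an automounted 'tank/fs@snap' under .zfs/snapshot), and are filtered out in the loop below.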
*/ if ((mnttab = fopen(MNTTAB, "re")) == NULL) { free(options); return (ENOENT); } while (getmntent(mnttab, &entry) == 0) { if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || strchr(entry.mnt_special, '@') != NULL) continue; if (json) { item = fnvlist_alloc(); fnvlist_add_string(item, "filesystem", entry.mnt_special); fnvlist_add_string(item, "mountpoint", entry.mnt_mountp); fnvlist_add_nvlist(data, entry.mnt_special, item); fnvlist_free(item); } else { (void) printf("%-30s %s\n", entry.mnt_special, entry.mnt_mountp); } } (void) fclose(mnttab); if (json) { fnvlist_add_nvlist(jsobj, "datasets", data); if (nvlist_empty(data)) fnvlist_free(jsobj); else zcmd_print_json(jsobj); fnvlist_free(data); } } else { zfs_handle_t *zhp; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; } else { ret = share_mount_one(zhp, op, flags, SA_NO_PROTOCOL, B_TRUE, options); zfs_commit_shares(NULL); zfs_close(zhp); } } free(options); return (ret); } /* * zfs mount -a * zfs mount filesystem * * Mount all filesystems, or mount the given filesystem. */ static int zfs_do_mount(int argc, char **argv) { return (share_mount(OP_MOUNT, argc, argv)); } /* * zfs share -a [nfs | smb] * zfs share filesystem * * Share all filesystems, or share the given filesystem. */ static int zfs_do_share(int argc, char **argv) { return (share_mount(OP_SHARE, argc, argv)); } typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; uu_avl_node_t un_avlnode; } unshare_unmount_node_t; static int unshare_unmount_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const unshare_unmount_node_t *l = larg; const unshare_unmount_node_t *r = rarg; return (strcmp(l->un_mountp, r->un_mountp)); } /* * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an * absolute path, find the entry in /proc/self/mounts, verify that it's a * ZFS filesystem, and unmount it appropriately. */ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; int ret = 0; struct stat64 statbuf; struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; char *zfs_mntpnt, *entry_mntpnt; /* * Search for the given (major,minor) pair in the mount table. */ if (getextmntent(path, &entry, &statbuf) != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); return (1); } (void) fprintf(stderr, gettext("warning: %s not in " "/proc/self/mounts\n"), path); if ((ret = umount2(path, flags)) != 0) (void) fprintf(stderr, gettext("%s: %s\n"), path, strerror(errno)); return (ret != 0); } path_inode = statbuf.st_ino; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " "filesystem\n"), cmdname, path); return (1); } if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); ret = 1; if (stat64(entry.mnt_mountp, &statbuf) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), cmdname, path, strerror(errno)); goto out; } else if (statbuf.st_ino != path_inode) { (void) fprintf(stderr, gettext("cannot " "%s '%s': not a mountpoint\n"), cmdname, path); goto out; } /* * If the filesystem is mounted, check that the mountpoint matches * the one in the mnttab entry w.r.t. provided path. If it doesn't, * then we should not proceed further.
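 * For example (hypothetical), if the dataset was remounted at a new
 * location after this /proc/self/mounts entry was recorded, the stale
 * path would no longer name the active mountpoint, so we bail out
 * rather than unmount the wrong thing.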
*/ entry_mntpnt = strdup(entry.mnt_mountp); if (zfs_is_mounted(zhp, &zfs_mntpnt)) { if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': " "not an original mountpoint\n"), cmdname, path); free(zfs_mntpnt); free(entry_mntpnt); goto out; } free(zfs_mntpnt); } free(entry_mntpnt); if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(smbshare_prop, "off") == 0) { (void) fprintf(stderr, gettext("cannot unshare " "'%s': legacy share\n"), path); (void) fprintf(stderr, gettext("use exportfs(8) " "or smbcontrol(1) to unshare this filesystem\n")); } else if (!zfs_is_shared(zhp, NULL, NULL)) { (void) fprintf(stderr, gettext("cannot unshare '%s': " "not currently shared\n"), path); } else { ret = zfs_unshare(zhp, path, NULL); zfs_commit_shares(NULL); } } else { char mtpt_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); if (is_manual) { ret = zfs_unmount(zhp, NULL, flags); } else if (strcmp(mtpt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot unmount " "'%s': legacy mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use umount(8) " "to unmount this filesystem\n")); } else { ret = zfs_unmountall(zhp, flags); } } out: zfs_close(zhp); return (ret != 0); } /* * Generic callback for unsharing or unmounting a filesystem. */ static int unshare_unmount(int op, int argc, char **argv) { int do_all = 0; int flags = 0; int ret = 0; int c; zfs_handle_t *zhp; char nfs_mnt_prop[ZFS_MAXPROPLEN]; char sharesmb[ZFS_MAXPROPLEN]; /* check options */ while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'f': flags |= MS_FORCE; break; case 'u': flags |= MS_CRYPT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (do_all) { /* * We could make use of zfs_for_each() to walk all datasets in * the system, but this would be very inefficient, especially * since we would have to linearly search /proc/self/mounts for * each one. Instead, do one pass through /proc/self/mounts * looking for zfs entries and call zfs_unmount() for each one. * * Things get a little tricky if the administrator has created * mountpoints beneath other ZFS filesystems. In this case, we * have to unmount the deepest filesystems first. To accomplish * this, we place all the mountpoints in an AVL tree sorted by * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. 
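 * For example (illustrative), with mountpoints /a, /a/b and /a/b/c,
 * the reverse walk visits /a/b/c, then /a/b, then /a, so each child
 * is unmounted before its parent.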
*/ FILE *mnttab; struct mnttab entry; uu_avl_pool_t *pool; uu_avl_t *tree = NULL; unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; enum sa_protocol *protocol = NULL, single_protocol[] = {SA_NO_PROTOCOL, SA_NO_PROTOCOL}; if (op == OP_SHARE && argc > 0) { *single_protocol = sa_protocol_decode(argv[0]); protocol = single_protocol; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (((pool = uu_avl_pool_create("unmount_pool", sizeof (unshare_unmount_node_t), offsetof(unshare_unmount_node_t, un_avlnode), unshare_unmount_compare, UU_DEFAULT)) == NULL) || ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) nomem(); if ((mnttab = fopen(MNTTAB, "re")) == NULL) { uu_avl_destroy(tree); uu_avl_pool_destroy(pool); return (ENOENT); } while (getmntent(mnttab, &entry) == 0) { /* ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* ignore snapshots */ if (strchr(entry.mnt_special, '@') != NULL) continue; if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; continue; } /* * Ignore datasets that are excluded/restricted by * parent pool name. */ if (zpool_skip_pool(zfs_get_pool_name(zhp))) { zfs_close(zhp); continue; } switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") != 0) break; verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0) continue; break; case OP_MOUNT: /* Ignore legacy mounts */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "legacy") == 0) continue; /* Ignore canmount=noauto mounts */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) continue; break; default: break; } node = safe_malloc(sizeof (unshare_unmount_node_t)); node->un_zhp = zhp; node->un_mountp = safe_strdup(entry.mnt_mountp); uu_avl_node_init(node, &node->un_avlnode, pool); if (uu_avl_find(tree, node, NULL, &idx) == NULL) { uu_avl_insert(tree, node, idx); } else { zfs_close(node->un_zhp); free(node->un_mountp); free(node); } } (void) fclose(mnttab); /* * Walk the AVL tree in reverse, unmounting each filesystem and * removing it from the AVL tree in the process. */ if ((walk = uu_avl_walk_start(tree, UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { const char *mntarg = NULL; uu_avl_remove(tree, node); switch (op) { case OP_SHARE: if (zfs_unshare(node->un_zhp, node->un_mountp, protocol) != 0) ret = 1; break; case OP_MOUNT: if (zfs_unmount(node->un_zhp, mntarg, flags) != 0) ret = 1; break; } zfs_close(node->un_zhp); free(node->un_mountp); free(node); } if (op == OP_SHARE) zfs_commit_shares(protocol); uu_avl_walk_end(walk); uu_avl_destroy(tree); uu_avl_pool_destroy(pool); } else { if (argc != 1) { if (argc == 0) (void) fprintf(stderr, gettext("missing filesystem argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * We have an argument, but it may be a full path or a ZFS * filesystem. Pass full paths off to unmount_path() (shared by * manual_unmount), otherwise open the filesystem and pass to * zfs_unmount(). 
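 * For example, "zfs unmount /export/home" takes the path-based branch
 * below, while "zfs unmount tank/home" opens the dataset by name.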
*/ if (argv[0][0] == '/') return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) return (1); verify(zfs_prop_get(zhp, op == OP_SHARE ? ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, sharesmb, sizeof (sharesmb), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(sharesmb, "off") == 0) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "exports(5) or smb.conf(5) to unshare " "this filesystem\n")); ret = 1; } else if (!zfs_is_shared(zhp, NULL, NULL)) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': not currently " "shared\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unshareall(zhp, NULL) != 0) { ret = 1; } break; case OP_MOUNT: if (strcmp(nfs_mnt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "umount(8) to unmount this " "filesystem\n")); ret = 1; } else if (!zfs_is_mounted(zhp, NULL)) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': not currently " "mounted\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unmountall(zhp, flags) != 0) { ret = 1; } break; } zfs_close(zhp); } return (ret); } /* * zfs unmount [-fu] -a * zfs unmount [-fu] filesystem * * Unmount all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unmount(int argc, char **argv) { return (unshare_unmount(OP_MOUNT, argc, argv)); } /* * zfs unshare -a * zfs unshare filesystem * * Unshare all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unshare(int argc, char **argv) { return (unshare_unmount(OP_SHARE, argc, argv)); } static int find_command_idx(const char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } static int zfs_do_diff(int argc, char **argv) { zfs_handle_t *zhp; int flags = 0; char *tosnap = NULL; char *fromsnap = NULL; char *atp, *copy; int err = 0; int c; struct sigaction sa; while ((c = getopt(argc, argv, "FHth")) != -1) { switch (c) { case 'F': flags |= ZFS_DIFF_CLASSIFY; break; case 'H': flags |= ZFS_DIFF_PARSEABLE; break; case 't': flags |= ZFS_DIFF_TIMESTAMP; break; case 'h': flags |= ZFS_DIFF_NO_MANGLE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("must provide at least one snapshot name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } fromsnap = argv[0]; tosnap = (argc == 2) ? 
argv[1] : NULL; copy = NULL; if (*fromsnap != '@') copy = strdup(fromsnap); else if (tosnap) copy = strdup(tosnap); if (copy == NULL) usage(B_FALSE); if ((atp = strchr(copy, '@')) != NULL) *atp = '\0'; if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { free(copy); return (1); } free(copy); /* * Ignore SIGPIPE so that the library can give us * information on any failure */ if (sigemptyset(&sa.sa_mask) == -1) { err = errno; goto out; } sa.sa_flags = 0; sa.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &sa, NULL) == -1) { err = errno; goto out; } err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); out: zfs_close(zhp); return (err != 0); } /* * zfs bookmark | * * Creates a bookmark with the given name from the source snapshot * or creates a copy of an existing source bookmark. */ static int zfs_do_bookmark(int argc, char **argv) { char *source, *bookname; char expbuf[ZFS_MAX_DATASET_NAME_LEN]; int source_type; nvlist_t *nvl; int ret = 0; int c; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing bookmark argument\n")); goto usage; } source = argv[0]; bookname = argv[1]; if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { (void) fprintf(stderr, gettext("invalid source name '%s': " "must contain a '@' or '#'\n"), source); goto usage; } if (strchr(bookname, '#') == NULL) { (void) fprintf(stderr, gettext("invalid bookmark name '%s': " "must contain a '#'\n"), bookname); goto usage; } /* * expand source or bookname to full path: * one of them may be specified as short name */ { char **expand; char *source_short, *bookname_short; source_short = strpbrk(source, "@#"); bookname_short = strpbrk(bookname, "#"); if (source_short == source && bookname_short == bookname) { (void) fprintf(stderr, gettext( "either source or bookmark must be specified as " "full dataset paths")); goto usage; } else if (source_short != source && bookname_short != bookname) { expand = NULL; } else if (source_short != source) { strlcpy(expbuf, source, sizeof (expbuf)); expand = &bookname; } else if (bookname_short != bookname) { strlcpy(expbuf, bookname, sizeof (expbuf)); expand = &source; } else { abort(); } if (expand != NULL) { *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ (void) strlcat(expbuf, *expand, sizeof (expbuf)); *expand = expbuf; } } /* determine source type */ switch (*strpbrk(source, "@#")) { case '@': source_type = ZFS_TYPE_SNAPSHOT; break; case '#': source_type = ZFS_TYPE_BOOKMARK; break; default: abort(); } /* test the source exists */ zfs_handle_t *zhp; zhp = zfs_open(g_zfs, source, source_type); if (zhp == NULL) goto usage; zfs_close(zhp); nvl = fnvlist_alloc(); fnvlist_add_string(nvl, bookname, source); ret = lzc_bookmark(nvl, NULL); fnvlist_free(nvl); if (ret != 0) { const char *err_msg = NULL; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create bookmark '%s'"), bookname); switch (ret) { case EXDEV: err_msg = "bookmark is in a different pool"; break; case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: err_msg = "source is not an ancestor of the " "new bookmark's dataset"; break; case EEXIST: err_msg = "bookmark exists"; break; case EINVAL: err_msg = "invalid argument"; break; case ENOTSUP: 
err_msg = "bookmark feature not enabled"; break; case ENOSPC: err_msg = "out of space"; break; case ENOENT: err_msg = "dataset does not exist"; break; default: (void) zfs_standard_error(g_zfs, ret, errbuf); break; } if (err_msg != NULL) { (void) fprintf(stderr, "%s: %s\n", errbuf, dgettext(TEXT_DOMAIN, err_msg)); } } return (ret != 0); usage: usage(B_FALSE); return (-1); } static int zfs_do_channel_program(int argc, char **argv) { int ret, fd, c; size_t progsize, progread; nvlist_t *outnvl = NULL; uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; struct option long_options[] = { {"json", no_argument, NULL, 'j'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "nt:m:j", long_options, NULL)) != -1) { switch (c) { case 't': case 'm': { uint64_t arg; char *endp; errno = 0; arg = strtoull(optarg, &endp, 0); if (errno != 0 || *endp != '\0') { (void) fprintf(stderr, gettext( "invalid argument " "'%s': expected integer\n"), optarg); goto usage; } if (c == 't') { instrlimit = arg; } else { ASSERT3U(c, ==, 'm'); memlimit = arg; } break; } case 'n': { sync_flag = B_FALSE; break; } case 'j': { json_output = B_TRUE; break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, gettext("invalid number of arguments\n")); goto usage; } const char *poolname = argv[0]; const char *filename = argv[1]; if (strcmp(filename, "-") == 0) { fd = 0; filename = "standard input"; } else if ((fd = open(filename, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), filename, strerror(errno)); return (1); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("cannot open pool '%s'\n"), poolname); if (fd != 0) (void) close(fd); return (1); } zpool_close(zhp); /* * Read in the channel program, expanding the program buffer as * necessary. */ progread = 0; progsize = 1024; char *progbuf = safe_malloc(progsize); do { ret = read(fd, progbuf + progread, progsize - progread); progread += ret; if (progread == progsize && ret > 0) { progsize *= 2; progbuf = safe_realloc(progbuf, progsize); } } while (ret > 0); if (fd != 0) (void) close(fd); if (ret < 0) { free(progbuf); (void) fprintf(stderr, gettext("cannot read '%s': %s\n"), filename, strerror(errno)); return (1); } progbuf[progread] = '\0'; /* * Any remaining arguments are passed as arguments to the lua script as * a string array: * { * "argv" -> [ "arg 1", ... "arg n" ], * } */ nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, (const char **)argv + 2, argc - 2); if (sync_flag) { ret = lzc_channel_program(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } else { ret = lzc_channel_program_nosync(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } if (ret != 0) { /* * On error, report the error message handed back by lua if one * exists. Otherwise, generate an appropriate error message, * falling back on strerror() for an unexpected return code. 
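 * For example (illustrative), a script that exceeds its instruction
 * limit fails with ETIME and the limit that was hit is printed below,
 * while a Lua-level error() message is passed through via
 * ZCP_RET_ERROR.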
*/ const char *errstring = NULL; const char *msg = gettext("Channel program execution failed"); uint64_t instructions = 0; if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { const char *es = NULL; (void) nvlist_lookup_string(outnvl, ZCP_RET_ERROR, &es); if (es == NULL) errstring = strerror(ret); else errstring = es; if (ret == ETIME) { (void) nvlist_lookup_uint64(outnvl, ZCP_ARG_INSTRLIMIT, &instructions); } } else { switch (ret) { case EINVAL: errstring = "Invalid instruction or memory limit."; break; case ENOMEM: errstring = "Return value too large."; break; case ENOSPC: errstring = "Memory limit exhausted."; break; case ETIME: errstring = "Timed out."; break; case EPERM: errstring = "Permission denied. Channel " "programs must be run as root."; break; default: (void) zfs_standard_error(g_zfs, ret, msg); } } if (errstring != NULL) (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); if (ret == ETIME && instructions != 0) (void) fprintf(stderr, gettext("%llu Lua instructions\n"), (u_longlong_t)instructions); } else { if (json_output) { (void) nvlist_print_json(stdout, outnvl); } else if (nvlist_empty(outnvl)) { (void) fprintf(stdout, gettext("Channel program fully " "executed and did not produce output.\n")); } else { (void) fprintf(stdout, gettext("Channel program fully " "executed and produced output:\n")); dump_nvlist(outnvl, 4); } } free(progbuf); fnvlist_free(outnvl); fnvlist_free(argnvl); return (ret != 0); usage: usage(B_FALSE); return (-1); } typedef struct loadkey_cbdata { boolean_t cb_loadkey; boolean_t cb_recursive; boolean_t cb_noop; char *cb_keylocation; uint64_t cb_numfailed; uint64_t cb_numattempted; } loadkey_cbdata_t; static int load_key_callback(zfs_handle_t *zhp, void *data) { int ret; boolean_t is_encroot; loadkey_cbdata_t *cb = data; uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); /* * If we are working recursively, we want to skip loading / unloading * keys for non-encryption roots and datasets whose keys are already * in the desired end-state. 
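 * For example (illustrative), "zfs load-key -r tank" attempts only
 * encryption roots under "tank"; datasets that inherit a key, or whose
 * keys are already in the desired state, are skipped.
 */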
*/ if (cb->cb_recursive) { ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); if (ret != 0) return (ret); if (!is_encroot) return (0); if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) || (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE)) return (0); } cb->cb_numattempted++; if (cb->cb_loadkey) ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation); else ret = zfs_crypto_unload_key(zhp); if (ret != 0) { cb->cb_numfailed++; return (ret); } return (0); } static int load_unload_keys(int argc, char **argv, boolean_t loadkey) { int c, ret = 0, flags = 0; boolean_t do_all = B_FALSE; loadkey_cbdata_t cb = { 0 }; cb.cb_loadkey = loadkey; while ((c = getopt(argc, argv, "anrL:")) != -1) { /* noop and alternate keylocations only apply to zfs load-key */ if (loadkey) { switch (c) { case 'n': cb.cb_noop = B_TRUE; continue; case 'L': cb.cb_keylocation = optarg; continue; default: break; } } switch (c) { case 'a': do_all = B_TRUE; cb.cb_recursive = B_TRUE; break; case 'r': flags |= ZFS_ITER_RECURSE; cb.cb_recursive = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!do_all && argc == 0) { (void) fprintf(stderr, gettext("Missing dataset argument or -a option\n")); usage(B_FALSE); } if (do_all && argc != 0) { (void) fprintf(stderr, gettext("Cannot specify dataset with -a option\n")); usage(B_FALSE); } if (cb.cb_recursive && cb.cb_keylocation != NULL && strcmp(cb.cb_keylocation, "prompt") != 0) { (void) fprintf(stderr, gettext("alternate keylocation may only " "be 'prompt' with -r or -a\n")); usage(B_FALSE); } ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0, load_key_callback, &cb); if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) { (void) printf(gettext("%llu / %llu key(s) successfully %s\n"), (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), (u_longlong_t)cb.cb_numattempted, loadkey ? (cb.cb_noop ? 
"verified" : "loaded") : "unloaded"); } if (cb.cb_numfailed != 0) ret = -1; return (ret); } static int zfs_do_load_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_TRUE)); } static int zfs_do_unload_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_FALSE)); } static int zfs_do_change_key(int argc, char **argv) { int c, ret; uint64_t keystatus; boolean_t loadkey = B_FALSE, inheritkey = B_FALSE; zfs_handle_t *zhp = NULL; nvlist_t *props = fnvlist_alloc(); while ((c = getopt(argc, argv, "lio:")) != -1) { switch (c) { case 'l': loadkey = B_TRUE; break; case 'i': inheritkey = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (inheritkey && !nvlist_empty(props)) { (void) fprintf(stderr, gettext("Properties not allowed for inheriting\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("Too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[argc - 1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (loadkey) { keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus != ZFS_KEYSTATUS_AVAILABLE) { ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } } /* refresh the properties so the new keystatus is visible */ zfs_refresh_properties(zhp); } ret = zfs_crypto_rewrap(zhp, props, inheritkey); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } nvlist_free(props); zfs_close(zhp); return (0); } /* * 1) zfs project [-d|-r] * List project ID and inherit flag of file(s) or directories. * -d: List the directory itself, not its children. * -r: List subdirectories recursively. * * 2) zfs project -C [-k] [-r] * Clear project inherit flag and/or ID on the file(s) or directories. * -k: Keep the project ID unchanged. If not specified, the project ID * will be reset as zero. * -r: Clear on subdirectories recursively. * * 3) zfs project -c [-0] [-d|-r] [-p id] * Check project ID and inherit flag on the file(s) or directories, * report the outliers. * -0: Print file name followed by a NUL instead of newline. * -d: Check the directory itself, not its children. * -p: Specify the referenced ID for comparing with the target file(s) * or directories' project IDs. If not specified, the target (top) * directory's project ID will be used as the referenced one. * -r: Check subdirectories recursively. * * 4) zfs project [-p id] [-r] [-s] * Set project ID and/or inherit flag on the file(s) or directories. * -p: Set the project ID as the given id. * -r: Set on subdirectories recursively. If not specify "-p" option, * it will use top-level directory's project ID as the given id, * then set both project ID and inherit flag on all descendants * of the top-level directory. * -s: Set project inherit flag. 
*/ static int zfs_do_project(int argc, char **argv) { zfs_project_control_t zpc = { .zpc_expected_projid = ZFS_INVALID_PROJID, .zpc_op = ZFS_PROJECT_OP_DEFAULT, .zpc_dironly = B_FALSE, .zpc_keep_projid = B_FALSE, .zpc_newline = B_TRUE, .zpc_recursive = B_FALSE, .zpc_set_flag = B_FALSE, }; int ret = 0, c; if (argc < 2) usage(B_FALSE); while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) { switch (c) { case '0': zpc.zpc_newline = B_FALSE; break; case 'C': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CLEAR; break; case 'c': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CHECK; break; case 'd': zpc.zpc_dironly = B_TRUE; /* overwrite "-r" option */ zpc.zpc_recursive = B_FALSE; break; case 'k': zpc.zpc_keep_projid = B_TRUE; break; case 'p': { char *endptr; errno = 0; zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("project ID must be less than " "%u\n"), UINT32_MAX); usage(B_FALSE); } if (zpc.zpc_expected_projid >= UINT32_MAX) { (void) fprintf(stderr, gettext("invalid project ID\n")); usage(B_FALSE); } break; } case 'r': zpc.zpc_recursive = B_TRUE; /* overwrite "-d" option */ zpc.zpc_dironly = B_FALSE; break; case 's': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_set_flag = B_TRUE; zpc.zpc_op = ZFS_PROJECT_OP_SET; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) { if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) zpc.zpc_op = ZFS_PROJECT_OP_SET; else zpc.zpc_op = ZFS_PROJECT_OP_LIST; } switch (zpc.zpc_op) { case ZFS_PROJECT_OP_LIST: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CHECK: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CLEAR: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) { (void) fprintf(stderr, gettext("'-p' is useless together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_SET: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless for set project ID and/or " "inherit flag\n")); usage(B_FALSE); } if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; default: ASSERT(0); break; } argv += optind; argc -= optind; if (argc == 0) { (void) fprintf(stderr, gettext("missing file or directory target(s)\n")); usage(B_FALSE); } for (int i = 0; i < argc; i++) { int err; err = zfs_project_handle(argv[i], &zpc); if (err && !ret) ret = err; } return (ret); } static int zfs_rewrite_file(const char *path, boolean_t 
verbose, zfs_rewrite_args_t *args) { int fd, ret = 0; fd = open(path, O_WRONLY); if (fd < 0) { ret = errno; (void) fprintf(stderr, gettext("failed to open %s: %s\n"), path, strerror(errno)); return (ret); } if (ioctl(fd, ZFS_IOC_REWRITE, args) < 0) { ret = errno; (void) fprintf(stderr, gettext("failed to rewrite %s: %s\n"), path, strerror(errno)); } else if (verbose) { printf("%s\n", path); } close(fd); return (ret); } static int zfs_rewrite_dir(const char *path, boolean_t verbose, boolean_t xdev, dev_t dev, zfs_rewrite_args_t *args, nvlist_t *dirs) { struct dirent *ent; DIR *dir; int ret = 0, err; dir = opendir(path); if (dir == NULL) { if (errno == ENOENT) return (0); ret = errno; (void) fprintf(stderr, gettext("failed to opendir %s: %s\n"), path, strerror(errno)); return (ret); } size_t plen = strlen(path) + 1; while ((ent = readdir(dir)) != NULL) { char *fullname; struct stat st; if (ent->d_type != DT_REG && ent->d_type != DT_DIR) continue; if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) continue; if (plen + strlen(ent->d_name) >= PATH_MAX) { (void) fprintf(stderr, gettext("path too long %s/%s\n"), path, ent->d_name); ret = ENAMETOOLONG; continue; } if (asprintf(&fullname, "%s/%s", path, ent->d_name) == -1) { (void) fprintf(stderr, gettext("failed to allocate memory\n")); ret = ENOMEM; continue; } if (xdev) { if (lstat(fullname, &st) < 0) { ret = errno; (void) fprintf(stderr, gettext("failed to stat %s: %s\n"), fullname, strerror(errno)); free(fullname); continue; } if (st.st_dev != dev) { free(fullname); continue; } } if (ent->d_type == DT_REG) { err = zfs_rewrite_file(fullname, verbose, args); if (err) ret = err; } else { /* DT_DIR */ fnvlist_add_uint64(dirs, fullname, dev); } free(fullname); } closedir(dir); return (ret); } static int zfs_rewrite_path(const char *path, boolean_t verbose, boolean_t recurse, boolean_t xdev, zfs_rewrite_args_t *args, nvlist_t *dirs) { struct stat st; int ret = 0; if (lstat(path, &st) < 0) { ret = errno; (void) fprintf(stderr, gettext("failed to stat %s: %s\n"), path, strerror(errno)); return (ret); } if (S_ISREG(st.st_mode)) { ret = zfs_rewrite_file(path, verbose, args); } else if (S_ISDIR(st.st_mode) && recurse) { ret = zfs_rewrite_dir(path, verbose, xdev, st.st_dev, args, dirs); } return (ret); } static int zfs_do_rewrite(int argc, char **argv) { int ret = 0, err, c; boolean_t recurse = B_FALSE, verbose = B_FALSE, xdev = B_FALSE; if (argc < 2) usage(B_FALSE); zfs_rewrite_args_t args; memset(&args, 0, sizeof (args)); while ((c = getopt(argc, argv, "Pl:o:rvx")) != -1) { switch (c) { case 'P': args.flags |= ZFS_REWRITE_PHYSICAL; break; case 'l': args.len = strtoll(optarg, NULL, 0); break; case 'o': args.off = strtoll(optarg, NULL, 0); break; case 'r': recurse = B_TRUE; break; case 'v': verbose = B_TRUE; break; case 'x': xdev = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argv += optind; argc -= optind; if (argc == 0) { (void) fprintf(stderr, gettext("missing file or directory target(s)\n")); usage(B_FALSE); } nvlist_t *dirs = fnvlist_alloc(); for (int i = 0; i < argc; i++) { err = zfs_rewrite_path(argv[i], verbose, recurse, xdev, &args, dirs); if (err) ret = err; } nvpair_t *dir; while ((dir = nvlist_next_nvpair(dirs, NULL)) != NULL) { err = zfs_rewrite_dir(nvpair_name(dir), verbose, xdev, fnvpair_value_uint64(dir), &args, dirs); if (err) ret = err; fnvlist_remove_nvpair(dirs, dir); } fnvlist_free(dirs); return (ret); } static int zfs_do_wait(int argc, char **argv) { 
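	/*
	 * zfs wait [-t activity[,...]] filesystem
	 *
	 * Block until the selected background activities (currently only
	 * "deleteq") have completed on the given filesystem.
	 */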
boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; int error = 0, i; int c; /* By default, wait for all types of activity. */ for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "t:")) != -1) { switch (c) { case 't': /* Reset activities array */ memset(&enabled, 0, sizeof (enabled)); for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_subopts[ ZFS_WAIT_NUM_ACTIVITIES] = { "deleteq" }; for (i = 0; i < ARRAY_SIZE(col_subopts); ++i) if (strcmp(tok, col_subopts[i]) == 0) { enabled[i] = B_TRUE; goto found; } (void) fprintf(stderr, gettext("invalid activity '%s'\n"), tok); usage(B_FALSE); found:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argv += optind; argc -= optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'filesystem' " "argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!enabled[i]) continue; error = zfs_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zfs_close(zhp); return (error); } /* * Display version message */ static int zfs_do_version(int argc, char **argv) { int c; nvlist_t *jsobj = NULL, *zfs_ver = NULL; boolean_t json = B_FALSE; struct option long_options[] = { {"json", no_argument, NULL, 'j'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "j", long_options, NULL)) != -1) { switch (c) { case 'j': json = B_TRUE; jsobj = zfs_json_schema(0, 1); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; if (argc != 0) { (void) fprintf(stderr, "too many arguments\n"); usage(B_FALSE); } if (json) { zfs_ver = zfs_version_nvlist(); if (zfs_ver) { fnvlist_add_nvlist(jsobj, "zfs_version", zfs_ver); zcmd_print_json(jsobj); fnvlist_free(zfs_ver); return (0); } else return (-1); } else return (zfs_version_print() != 0); } /* Display documentation */ static int zfs_do_help(int argc, char **argv) { char page[MAXNAMELEN]; if (argc < 3 || strcmp(argv[2], "zfs") == 0) strcpy(page, "zfs"); else if (strcmp(argv[2], "concepts") == 0 || strcmp(argv[2], "props") == 0) snprintf(page, sizeof (page), "zfs%s", argv[2]); else snprintf(page, sizeof (page), "zfs-%s", argv[2]); execlp("man", "man", page, NULL); fprintf(stderr, "couldn't run man program: %s", strerror(errno)); return (-1); } int main(int argc, char **argv) { int ret = 0; int i = 0; const char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * The 'umount' command is an alias for 'unmount' */ if (strcmp(cmdname, "umount") == 0) cmdname = "unmount"; /* * The 'recv' command is an alias for 'receive' */ if (strcmp(cmdname, "recv") == 0) cmdname = "receive"; /* * The 'snap' command is an alias for 'snapshot' */ if (strcmp(cmdname, "snap") == 0) cmdname = "snapshot"; /* * Special case '-?' 
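 * and '--help': print the full usage message and exit successfully.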
*/ if ((strcmp(cmdname, "-?") == 0) || (strcmp(cmdname, "--help") == 0)) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zfs_version_print() != 0); /* * Special case 'help' */ if (strcmp(cmdname, "help") == 0) return (zfs_do_help(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); zfs_setproctitle_init(argc, argv, environ); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = safe_malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=') != NULL) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } /* * zfs zone nsfile filesystem * * Add or delete the given dataset to/from the namespace. 
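 *
 * For example (illustrative), "zfs zone /proc/$pid/ns/user tank/fs"
 * delegates "tank/fs" to the user namespace of process $pid.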
*/ #ifdef __linux__ static int zfs_do_zone_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int ret; if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_userns(zhp, argv[1], attach) != 0); zfs_close(zhp); return (ret); } static int zfs_do_zone(int argc, char **argv) { return (zfs_do_zone_impl(argc, argv, B_TRUE)); } static int zfs_do_unzone(int argc, char **argv) { return (zfs_do_zone_impl(argc, argv, B_FALSE)); } #endif #ifdef __FreeBSD__ #include #include /* * Attach/detach the given dataset to/from the given jail */ static int zfs_do_jail_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int jailid, ret; /* check number of arguments */ if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } jailid = jail_getid(argv[1]); if (jailid < 0) { (void) fprintf(stderr, gettext("invalid jail id or name\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_jail(zhp, jailid, attach) != 0); zfs_close(zhp); return (ret); } /* * zfs jail jailid filesystem * * Attach the given dataset to the given jail */ static int zfs_do_jail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_TRUE)); } /* * zfs unjail jailid filesystem * * Detach the given dataset from the given jail */ static int zfs_do_unjail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_FALSE)); } #endif diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 49ab9d3db795..662fd81c5ee1 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1,2006 +1,2008 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014, 2016, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. * Portions Copyright 2010 Robert Milkowski * Copyright (c) 2021, Colm Buckley * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2024, Klara, Inc. */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H extern __attribute__((visibility("default"))) #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. 
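 * For example, the change-key path above passes the mask
 * (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME) to zfs_open() to accept
 * either type of dataset.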
*/ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_INVALID = 0, ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4), ZFS_TYPE_VDEV = (1 << 5), } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXNAMELEN_NEW 1024 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZPROP_USERPROP = ZPROP_INVAL, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, ZFS_PROP_SELINUX_FSCONTEXT, ZFS_PROP_SELINUX_DEFCONTEXT, ZFS_PROP_SELINUX_ROOTCONTEXT, ZFS_PROP_RELATIME, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_OVERLAY, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_ENCRYPTION, ZFS_PROP_KEYLOCATION, ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, ZFS_PROP_PBKDF2_ITERS, ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, ZFS_PROP_REMAPTXG, /* obsolete - no longer used */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_PROP_IVSET_GUID, /* not 
exposed to the user */ ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_PROP_SNAPSHOTS_CHANGED, ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, ZFS_PROP_DIRECT, ZFS_PROP_LONGNAME, ZFS_PROP_DEFAULTUSERQUOTA, ZFS_PROP_DEFAULTGROUPQUOTA, ZFS_PROP_DEFAULTPROJECTQUOTA, ZFS_PROP_DEFAULTUSEROBJQUOTA, ZFS_PROP_DEFAULTGROUPOBJQUOTA, ZFS_PROP_DEFAULTPROJECTOBJQUOTA, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_PROP_USEROBJUSED, ZFS_PROP_USEROBJQUOTA, ZFS_PROP_GROUPOBJUSED, ZFS_PROP_GROUPOBJQUOTA, ZFS_PROP_PROJECTUSED, ZFS_PROP_PROJECTQUOTA, ZFS_PROP_PROJECTOBJUSED, ZFS_PROP_PROJECTOBJQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; _SYS_FS_ZFS_H const char *const zfs_userquota_prop_prefixes[ ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. Properties must be registered in zfs_prop_init(). */ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, ZPOOL_PROP_DEDUP_TABLE_SIZE, ZPOOL_PROP_DEDUP_TABLE_QUOTA, ZPOOL_PROP_DEDUPCACHED, ZPOOL_PROP_LAST_SCRUBBED_TXG, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(8). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_BOOLEAN_NA 2 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * VDEV properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. 
If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. */ typedef enum { VDEV_PROP_INVAL = -1, VDEV_PROP_USERPROP = VDEV_PROP_INVAL, VDEV_PROP_NAME, VDEV_PROP_CAPACITY, VDEV_PROP_STATE, VDEV_PROP_GUID, VDEV_PROP_ASIZE, VDEV_PROP_PSIZE, VDEV_PROP_ASHIFT, VDEV_PROP_SIZE, VDEV_PROP_FREE, VDEV_PROP_ALLOCATED, VDEV_PROP_COMMENT, VDEV_PROP_EXPANDSZ, VDEV_PROP_FRAGMENTATION, VDEV_PROP_BOOTSIZE, VDEV_PROP_PARITY, VDEV_PROP_PATH, VDEV_PROP_DEVID, VDEV_PROP_PHYS_PATH, VDEV_PROP_ENC_PATH, VDEV_PROP_FRU, VDEV_PROP_PARENT, VDEV_PROP_CHILDREN, VDEV_PROP_NUMCHILDREN, VDEV_PROP_READ_ERRORS, VDEV_PROP_WRITE_ERRORS, VDEV_PROP_CHECKSUM_ERRORS, VDEV_PROP_INITIALIZE_ERRORS, VDEV_PROP_OPS_NULL, VDEV_PROP_OPS_READ, VDEV_PROP_OPS_WRITE, VDEV_PROP_OPS_FREE, VDEV_PROP_OPS_CLAIM, VDEV_PROP_OPS_TRIM, VDEV_PROP_BYTES_NULL, VDEV_PROP_BYTES_READ, VDEV_PROP_BYTES_WRITE, VDEV_PROP_BYTES_FREE, VDEV_PROP_BYTES_CLAIM, VDEV_PROP_BYTES_TRIM, VDEV_PROP_REMOVING, VDEV_PROP_ALLOCATING, VDEV_PROP_FAILFAST, VDEV_PROP_CHECKSUM_N, VDEV_PROP_CHECKSUM_T, VDEV_PROP_IO_N, VDEV_PROP_IO_T, VDEV_PROP_RAIDZ_EXPANDING, VDEV_PROP_SLOW_IO_N, VDEV_PROP_SLOW_IO_T, VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, VDEV_PROP_SIT_OUT, VDEV_PROP_AUTOSIT, VDEV_NUM_PROPS } vdev_prop_t; /* * Dataset property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H const char *zfs_prop_default_string(zfs_prop_t); _SYS_FS_ZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_readonly(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_visible(zfs_prop_t prop); _SYS_FS_ZFS_H boolean_t zfs_prop_inheritable(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_setonce(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_encryption_key_param(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); _SYS_FS_ZFS_H const char *zfs_prop_to_name(zfs_prop_t); _SYS_FS_ZFS_H zfs_prop_t zfs_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_user(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_userquota(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_written(const char *); _SYS_FS_ZFS_H int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H zpool_prop_t zpool_name_to_prop(const char *); _SYS_FS_ZFS_H const char *zpool_prop_to_name(zpool_prop_t); _SYS_FS_ZFS_H const char *zpool_prop_default_string(zpool_prop_t); _SYS_FS_ZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_readonly(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_setonce(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_feature(const char *); _SYS_FS_ZFS_H boolean_t zpool_prop_unsupported(const char *); _SYS_FS_ZFS_H int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * VDEV property functions shared between libzfs and kernel. 
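 * For example (illustrative), vdev_name_to_prop("capacity") resolves
 * to VDEV_PROP_CAPACITY, and vdev_prop_to_name() maps it back.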
*/ _SYS_FS_ZFS_H vdev_prop_t vdev_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t vdev_prop_user(const char *name); _SYS_FS_ZFS_H const char *vdev_prop_to_name(vdev_prop_t); _SYS_FS_ZFS_H const char *vdev_prop_default_string(vdev_prop_t); _SYS_FS_ZFS_H uint64_t vdev_prop_default_numeric(vdev_prop_t); _SYS_FS_ZFS_H boolean_t vdev_prop_readonly(vdev_prop_t prop); _SYS_FS_ZFS_H int vdev_prop_index_to_string(vdev_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int vdev_prop_string_to_index(vdev_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H boolean_t zpool_prop_vdev(const char *name); _SYS_FS_ZFS_H uint64_t vdev_prop_random_value(vdev_prop_t prop, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_XATTR_OFF = 0, ZFS_XATTR_DIR = 1, ZFS_XATTR_SA = 2 } zfs_xattr_type_t; typedef enum { ZFS_DNSIZE_LEGACY = 0, ZFS_DNSIZE_AUTO = 1, ZFS_DNSIZE_1K = 1024, ZFS_DNSIZE_2K = 2048, ZFS_DNSIZE_4K = 4096, ZFS_DNSIZE_8K = 8192, ZFS_DNSIZE_16K = 16384 } zfs_dnsize_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST, ZFS_REDUNDANT_METADATA_SOME, ZFS_REDUNDANT_METADATA_NONE } zfs_redundant_metadata_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum { ZFS_DIRECT_DISABLED = 0, ZFS_DIRECT_STANDARD, ZFS_DIRECT_ALWAYS } zfs_direct_t; typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, ZFS_KEYSTATUS_AVAILABLE, } zfs_keystatus_t; typedef enum zfs_keyformat { ZFS_KEYFORMAT_NONE = 0, ZFS_KEYFORMAT_RAW, ZFS_KEYFORMAT_HEX, ZFS_KEYFORMAT_PASSPHRASE, ZFS_KEYFORMAT_FORMATS } zfs_keyformat_t; typedef enum zfs_key_location { ZFS_KEYLOCATION_NONE = 0, ZFS_KEYLOCATION_PROMPT, ZFS_KEYLOCATION_URI, ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; typedef enum { ZFS_PREFETCH_NONE = 0, ZFS_PREFETCH_METADATA = 1, ZFS_PREFETCH_ALL = 2 } zfs_prefetch_type_t; #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 /* * On-disk version number. 
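 * Legacy pools advance through versions 1-28; 5000 is a sentinel
 * meaning that feature flags, rather than the version number, describe
 * the pool's on-disk format (see the note after these definitions).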
*/ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * The incrementing pool version number has been replaced by pool feature * flags. For more details, see zfeature.c. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk 
format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Persistent L2ARC version */ #define L2ARC_PERSISTENT_VERSION_1 1ULL #define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 #define L2ARC_PERSISTENT_VERSION_STRING "1" /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with "<reversed-DNS>:" * (e.g. "org.openzfs:") to avoid conflicting names being developed * independently.
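 *
 * A minimal read-side sketch for two of the names below:
 *
 *	const char *name;
 *	uint64_t guid;
 *	VERIFY0(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name));
 *	VERIFY0(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid));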
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_MIN_ALLOC "min_alloc" +#define ZPOOL_CONFIG_MAX_ALLOC "max_alloc" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" /* Active queue read/write stats */ #define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" #define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE "vdev_rebuild_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" #define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE "vdev_rebuild_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" #define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" #define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" #define ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO "vdev_rebuild_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" #define 
ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" #define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" #define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" #define ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO "vdev_ind_rebuild_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" #define ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO "vdev_agg_rebuild_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" /* Number of Direct I/O write verify errors */ #define ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS "vdev_dio_verify_errors" /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" #define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_NONALLOCATING "non_allocating" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_ROOT_ZAP "com.klarasystems:vdev_zap_root" #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP 
"com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_META_ERRORS "verify_meta_errors" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" /* dRAID configuration */ #define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DRAID "draid" #define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_DRAID_MAXPARITY 3 #define VDEV_DRAID_MIN_CHILDREN 2 #define VDEV_DRAID_MAX_CHILDREN UINT8_MAX /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
*/ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" #define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ "org.openzfs:vdev_rebuild" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ "org.openzfs:raidz_expand_state" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ "org.openzfs:raidz_expand_start_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ "org.openzfs:raidz_expand_end_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ "org.openzfs:raidz_expand_bytes_copied" /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" /* vdev initialize state */ #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ "com.delphix:vdev_initialize_state" #define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ "com.delphix:vdev_initialize_action_time" /* vdev TRIM state */ #define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \ "org.zfsonlinux:next_offset_to_trim" #define VDEV_LEAF_ZAP_TRIM_STATE \ "org.zfsonlinux:vdev_trim_state" #define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \ "org.zfsonlinux:vdev_trim_action_time" #define VDEV_LEAF_ZAP_TRIM_RATE \ "org.zfsonlinux:vdev_trim_rate" #define VDEV_LEAF_ZAP_TRIM_PARTIAL \ "org.zfsonlinux:vdev_trim_partial" #define VDEV_LEAF_ZAP_TRIM_SECURE \ "org.zfsonlinux:vdev_trim_secure" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache" #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * Settings for zpool compatibility features files */ #define ZPOOL_SYSCONF_COMPAT_D SYSCONFDIR "/zfs/compatibility.d" #define ZPOOL_DATA_COMPAT_D PKGDATADIR "/compatibility.d" #define ZPOOL_COMPAT_MAXSIZE 16384 /* * Hard-wired compatibility settings */ #define ZPOOL_COMPAT_LEGACY "legacy" #define ZPOOL_COMPAT_OFF "off" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. 
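 *
 * For example, a vdev that failed to open because the pool uses a feature
 * this software does not support would be reported as the pair:
 *
 *	vs->vs_state == VDEV_STATE_CANT_OPEN
 *	vs->vs_aux == VDEV_AUX_UNSUP_FEAT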
*/ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * mmp state. The following states provide additional detail describing * why a pool couldn't be safely imported. */ typedef enum mmp_state { MMP_STATE_ACTIVE = 0, /* In active use */ MMP_STATE_INACTIVE, /* Inactive and safe to import */ MMP_STATE_NO_HOSTID /* System hostid is not set */ } mmp_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_ERRORSCRUB, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FROM_LAST_TXG, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_FLUSH, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; /* * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to * user programs. */ #define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH /* * ZIO priority types. Needed to interpret vdev statistics below. * * NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER * VALUE. 
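 *
 * These values also index the per-priority queue depths in vdev_stat_ex_t
 * below; e.g. the number of synchronous reads currently issued to a leaf
 * vdev is:
 *
 *	vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]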
*/ typedef enum zio_priority { ZIO_PRIORITY_SYNC_READ, ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ ZIO_PRIORITY_ASYNC_READ, /* prefetch */ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ /* error scrub values stored on disk */ uint64_t pss_error_scrub_func; /* pool_scan_func_t */ uint64_t pss_error_scrub_state; /* dsl_scan_state_t */ uint64_t pss_error_scrub_start; /* error scrub start time */ uint64_t pss_error_scrub_end; /* error scrub end time */ uint64_t pss_error_scrub_examined; /* error blocks issued I/O */ /* error blocks to be issued I/O */ uint64_t pss_error_scrub_to_be_examined; /* error scrub values not stored on disk */ /* error scrub pause time in milliseconds */ uint64_t pss_pass_error_scrub_pause; } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. 
*/ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef struct pool_raidz_expand_stat { uint64_t pres_state; /* dsl_scan_state_t */ uint64_t pres_expanding_vdev; uint64_t pres_start_time; uint64_t pres_end_time; uint64_t pres_to_reflow; /* bytes that need to be moved */ uint64_t pres_reflowed; /* bytes moved so far */ uint64_t pres_waiting_for_resilver; } pool_raidz_expand_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_ERRORSCRUBBING, DSS_NUM_STATES } dsl_scan_state_t; typedef struct vdev_rebuild_stat { uint64_t vrs_state; /* vdev_rebuild_state_t */ uint64_t vrs_start_time; /* time_t */ uint64_t vrs_end_time; /* time_t */ uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ uint64_t vrs_bytes_issued; /* read bytes issued */ uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ uint64_t vrs_bytes_est; /* total bytes to scan */ uint64_t vrs_errors; /* scanning errors */ uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER. * The ordering of this enum must be maintained to ensure the errata identifiers * map to the correct documentation. New errata may only be appended to the * list and must contain corresponding documentation at the above link. */ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, ZPOOL_ERRATA_ZOL_2094_SCRUB, ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, ZPOOL_ERRATA_ZOL_6845_ENCRYPTION, ZPOOL_ERRATA_ZOL_8308_ENCRYPTION, } zpool_errata_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. * * The vs_ops[] and vs_bytes[] arrays must always have an array size of 6 in * order to keep subsequent members at their known fixed offsets. When * adding a new field it must be added to the end of the structure. */ #define VS_ZIO_TYPES 6 typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing?
*/ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ uint64_t vs_initialize_state; /* vdev_initializing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ uint64_t vs_slow_ios; /* slow IOs */ uint64_t vs_trim_errors; /* trimming errors */ uint64_t vs_trim_notsup; /* TRIM not supported by device */ uint64_t vs_trim_bytes_done; /* bytes trimmed */ uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ uint64_t vs_rebuild_processed; /* bytes rebuilt */ uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_pspace; /* physical capacity */ uint64_t vs_dio_verify_errors; /* DIO write verify errors */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof (uint64_t)) >= \ (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) /* * Extended stats * * These are stats which aren't included in the original iostat output. For * convenience, they are grouped together in vdev_stat_ex, although each stat * is individually exported as an nvlist. */ typedef struct vdev_stat_ex { /* Number of ZIOs issued to disk and waiting to finish */ uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* Number of ZIOs pending to be issued to disk */ uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* * Below are the histograms for various latencies. Buckets are in * units of nanoseconds. */ /* * 2^37 nanoseconds ≈ 137s. Timeouts will probably start kicking in * before this. */ #define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ #define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ /* Amount of time in ZIO queue (ns) */ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_L_HISTO_BUCKETS]; /* Total ZIO latency (ns). Includes queuing and disk access time */ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* Amount of time to read/write the disk (ns) */ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* "lookup the bucket for a value" histogram macros */ #define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \ buckets - 1) : 0) #define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) #define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) /* Physical IO histogram */ uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; /* Delegated (aggregated) physical IO histogram */ uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; } vdev_stat_ex_t; /* * Initialize functions. */ typedef enum pool_initialize_func { POOL_INITIALIZE_START, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, POOL_INITIALIZE_UNINIT, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; /* * TRIM functions. */ typedef enum pool_trim_func { POOL_TRIM_START, POOL_TRIM_CANCEL, POOL_TRIM_SUSPEND, POOL_TRIM_FUNCS } pool_trim_func_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array.
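 *
 * For example, the dedupratio reported by zpool(8) is derived from the
 * aggregate ddt_stat_t below, roughly (a sketch; the zero-denominator
 * case is elided):
 *
 *	ratio = (double)dds_total.dds_ref_dsize / dds_total.dds_dsize;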
*/ typedef struct ddt_object { uint64_t ddo_count; /* number of elements in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" #define ZFS_DEVDIR "/dev" #define ZFS_SUPER_MAGIC 0x2fc12fc1 /* general zvol path */ #define ZVOL_DIR "/dev/zvol/" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) #define ZVOL_MINORS (1 << 4) #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 16384 typedef enum { VDEV_INITIALIZE_NONE, VDEV_INITIALIZE_ACTIVE, VDEV_INITIALIZE_CANCELED, VDEV_INITIALIZE_SUSPENDED, VDEV_INITIALIZE_COMPLETE } vdev_initializing_state_t; typedef enum { VDEV_TRIM_NONE, VDEV_TRIM_ACTIVE, VDEV_TRIM_CANCELED, VDEV_TRIM_SUSPENDED, VDEV_TRIM_COMPLETE, } vdev_trim_state_t; typedef enum { VDEV_REBUILD_NONE, VDEV_REBUILD_ACTIVE, VDEV_REBUILD_CANCELED, VDEV_REBUILD_COMPLETE, } vdev_rebuild_state_t; /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl */ #define SNAP_ITER_MIN_TXG "snap_iter_min_txg" #define SNAP_ITER_MAX_TXG "snap_iter_max_txg" /* * /dev/zfs ioctl numbers. * * These numbers cannot change over time. New ioctl numbers must be appended. */ typedef enum zfs_ioc { /* * Core features - 89/128 numbers reserved. 
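 *
 * On Linux the commands below therefore encode as 0x5a00 + the enum
 * offset; e.g. ZFS_IOC_POOL_DESTROY is ('Z' << 8) + 0x01 == 0x5a01. On
 * FreeBSD, where ZFS_IOC_FIRST is 0, the same command is plain 0x01.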
*/ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, #else ZFS_IOC_FIRST = ('Z' << 8), #endif ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ ZFS_IOC_POOL_IMPORT, /* 0x5a02 */ ZFS_IOC_POOL_EXPORT, /* 0x5a03 */ ZFS_IOC_POOL_CONFIGS, /* 0x5a04 */ ZFS_IOC_POOL_STATS, /* 0x5a05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x5a06 */ ZFS_IOC_POOL_SCAN, /* 0x5a07 */ ZFS_IOC_POOL_FREEZE, /* 0x5a08 */ ZFS_IOC_POOL_UPGRADE, /* 0x5a09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x5a0a */ ZFS_IOC_VDEV_ADD, /* 0x5a0b */ ZFS_IOC_VDEV_REMOVE, /* 0x5a0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x5a0d */ ZFS_IOC_VDEV_ATTACH, /* 0x5a0e */ ZFS_IOC_VDEV_DETACH, /* 0x5a0f */ ZFS_IOC_VDEV_SETPATH, /* 0x5a10 */ ZFS_IOC_VDEV_SETFRU, /* 0x5a11 */ ZFS_IOC_OBJSET_STATS, /* 0x5a12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x5a13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x5a14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x5a15 */ ZFS_IOC_SET_PROP, /* 0x5a16 */ ZFS_IOC_CREATE, /* 0x5a17 */ ZFS_IOC_DESTROY, /* 0x5a18 */ ZFS_IOC_ROLLBACK, /* 0x5a19 */ ZFS_IOC_RENAME, /* 0x5a1a */ ZFS_IOC_RECV, /* 0x5a1b */ ZFS_IOC_SEND, /* 0x5a1c */ ZFS_IOC_INJECT_FAULT, /* 0x5a1d */ ZFS_IOC_CLEAR_FAULT, /* 0x5a1e */ ZFS_IOC_INJECT_LIST_NEXT, /* 0x5a1f */ ZFS_IOC_ERROR_LOG, /* 0x5a20 */ ZFS_IOC_CLEAR, /* 0x5a21 */ ZFS_IOC_PROMOTE, /* 0x5a22 */ ZFS_IOC_SNAPSHOT, /* 0x5a23 */ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x5a24 */ ZFS_IOC_OBJ_TO_PATH, /* 0x5a25 */ ZFS_IOC_POOL_SET_PROPS, /* 0x5a26 */ ZFS_IOC_POOL_GET_PROPS, /* 0x5a27 */ ZFS_IOC_SET_FSACL, /* 0x5a28 */ ZFS_IOC_GET_FSACL, /* 0x5a29 */ ZFS_IOC_SHARE, /* 0x5a2a */ ZFS_IOC_INHERIT_PROP, /* 0x5a2b */ ZFS_IOC_SMB_ACL, /* 0x5a2c */ ZFS_IOC_USERSPACE_ONE, /* 0x5a2d */ ZFS_IOC_USERSPACE_MANY, /* 0x5a2e */ ZFS_IOC_USERSPACE_UPGRADE, /* 0x5a2f */ ZFS_IOC_HOLD, /* 0x5a30 */ ZFS_IOC_RELEASE, /* 0x5a31 */ ZFS_IOC_GET_HOLDS, /* 0x5a32 */ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x5a33 */ ZFS_IOC_VDEV_SPLIT, /* 0x5a34 */ ZFS_IOC_NEXT_OBJ, /* 0x5a35 */ ZFS_IOC_DIFF, /* 0x5a36 */ ZFS_IOC_TMP_SNAPSHOT, /* 0x5a37 */ ZFS_IOC_OBJ_TO_STATS, /* 0x5a38 */ ZFS_IOC_SPACE_WRITTEN, /* 0x5a39 */ ZFS_IOC_SPACE_SNAPS, /* 0x5a3a */ ZFS_IOC_DESTROY_SNAPS, /* 0x5a3b */ ZFS_IOC_POOL_REGUID, /* 0x5a3c */ ZFS_IOC_POOL_REOPEN, /* 0x5a3d */ ZFS_IOC_SEND_PROGRESS, /* 0x5a3e */ ZFS_IOC_LOG_HISTORY, /* 0x5a3f */ ZFS_IOC_SEND_NEW, /* 0x5a40 */ ZFS_IOC_SEND_SPACE, /* 0x5a41 */ ZFS_IOC_CLONE, /* 0x5a42 */ ZFS_IOC_BOOKMARK, /* 0x5a43 */ ZFS_IOC_GET_BOOKMARKS, /* 0x5a44 */ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x5a45 */ ZFS_IOC_RECV_NEW, /* 0x5a46 */ ZFS_IOC_POOL_SYNC, /* 0x5a47 */ ZFS_IOC_CHANNEL_PROGRAM, /* 0x5a48 */ ZFS_IOC_LOAD_KEY, /* 0x5a49 */ ZFS_IOC_UNLOAD_KEY, /* 0x5a4a */ ZFS_IOC_CHANGE_KEY, /* 0x5a4b */ ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ ZFS_IOC_POOL_TRIM, /* 0x5a50 */ ZFS_IOC_REDACT, /* 0x5a51 */ ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ ZFS_IOC_WAIT, /* 0x5a53 */ ZFS_IOC_WAIT_FS, /* 0x5a54 */ ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
*/ ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ ZFS_IOC_EVENTS_CLEAR, /* 0x82 (Linux) */ ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ ZFS_IOC_USERNS_ATTACH = ZFS_IOC_JAIL, /* 0x85 (Linux) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ ZFS_IOC_USERNS_DETACH = ZFS_IOC_UNJAIL, /* 0x86 (Linux) */ ZFS_IOC_SET_BOOTENV, /* 0x87 */ ZFS_IOC_GET_BOOTENV, /* 0x88 */ ZFS_IOC_LAST } zfs_ioc_t; /* * zvol ioctl to get dataset name */ #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) #ifdef __linux__ /* * IOCTLs to update and retrieve additional file level attributes on * Linux. */ #define ZFS_IOC_GETDOSFLAGS _IOR(0x83, 1, uint64_t) #define ZFS_IOC_SETDOSFLAGS _IOW(0x83, 2, uint64_t) /* * Additional file level attributes, that are stored * in the upper half of z_pflags */ #define ZFS_READONLY 0x0000000100000000ull #define ZFS_HIDDEN 0x0000000200000000ull #define ZFS_SYSTEM 0x0000000400000000ull #define ZFS_ARCHIVE 0x0000000800000000ull #define ZFS_IMMUTABLE 0x0000001000000000ull #define ZFS_NOUNLINK 0x0000002000000000ull #define ZFS_APPENDONLY 0x0000004000000000ull #define ZFS_NODUMP 0x0000008000000000ull #define ZFS_OPAQUE 0x0000010000000000ull #define ZFS_AV_QUARANTINED 0x0000020000000000ull #define ZFS_AV_MODIFIED 0x0000040000000000ull #define ZFS_REPARSE 0x0000080000000000ull #define ZFS_OFFLINE 0x0000100000000000ull #define ZFS_SPARSE 0x0000200000000000ull #define ZFS_DOS_FL_USER_VISIBLE (ZFS_IMMUTABLE | ZFS_APPENDONLY | \ ZFS_NOUNLINK | ZFS_ARCHIVE | ZFS_NODUMP | ZFS_SYSTEM | \ ZFS_HIDDEN | ZFS_READONLY | ZFS_REPARSE | ZFS_OFFLINE | \ ZFS_SPARSE) #endif typedef struct zfs_rewrite_args { uint64_t off; uint64_t len; uint64_t flags; uint64_t arg; } zfs_rewrite_args_t; /* zfs_rewrite_args flags */ #define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */ #define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t) /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. * * These numbers should not change over time. New entries should be appended. * * (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py) */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_IOC_CMD_UNAVAIL, ZFS_ERR_IOC_ARG_UNAVAIL, ZFS_ERR_IOC_ARG_REQUIRED, ZFS_ERR_IOC_ARG_BADTYPE, ZFS_ERR_WRONG_PARENT, ZFS_ERR_FROM_IVSET_GUID_MISSING, ZFS_ERR_FROM_IVSET_GUID_MISMATCH, ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE, ZFS_ERR_EXPORT_IN_PROGRESS, ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, ZFS_ERR_RESILVER_IN_PROGRESS, ZFS_ERR_REBUILD_IN_PROGRESS, ZFS_ERR_BADPROP, ZFS_ERR_VDEV_NOTSUP, ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, ZFS_ERR_STREAM_LARGE_MICROZAP, ZFS_ERR_TOO_MANY_SITOUTS, } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. 
*/ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; typedef enum { ZPOOL_WAIT_CKPT_DISCARD, ZPOOL_WAIT_FREE, ZPOOL_WAIT_INITIALIZE, ZPOOL_WAIT_REPLACE, ZPOOL_WAIT_REMOVE, ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; typedef enum { ZFS_WAIT_DELETEQ, ZFS_WAIT_NUM_ACTIVITIES } zfs_wait_activity_t; typedef enum { ZPOOL_PREFETCH_NONE = 0, ZPOOL_PREFETCH_DDT } zpool_prefetch_type_t; typedef enum { ZPOOL_DDT_PRUNE_NONE, ZPOOL_DDT_PRUNE_AGE, /* in seconds */ ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */ } zpool_ddt_prune_unit_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_OUTPUT_SIZE "out_size" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" #define ZPOOL_HIST_ELAPSED_NS "elapsed_ns" /* * Special nvlist name that will not have its args recorded in the pool's * history log. */ #define ZPOOL_HIDDEN_ARGS "hidden_args" /* * The following is used when invoking ZFS_IOC_POOL_GET_PROPS. */ #define ZPOOL_GET_PROPS_NAMES "get_props_names" /* * Opt-in property names used with ZPOOL_GET_PROPS_NAMES. * For example, properties that are hidden or expensive to compute. */ #define ZPOOL_DEDUPCACHED_PROP_NAME "dedupcached" /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" /* * The following are names used when invoking ZFS_IOC_POOL_REGUID. */ #define ZPOOL_REGUID_GUID "guid" /* * The following are names used when invoking ZFS_IOC_POOL_TRIM. */ #define ZPOOL_TRIM_COMMAND "trim_command" #define ZPOOL_TRIM_VDEVS "trim_vdevs" #define ZPOOL_TRIM_RATE "trim_rate" #define ZPOOL_TRIM_SECURE "trim_secure" /* * The following are names used when invoking ZFS_IOC_POOL_WAIT. */ #define ZPOOL_WAIT_ACTIVITY "wait_activity" #define ZPOOL_WAIT_TAG "wait_tag" #define ZPOOL_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_VDEV_GET_PROP. */ #define ZPOOL_VDEV_PROPS_GET_VDEV "vdevprops_get_vdev" #define ZPOOL_VDEV_PROPS_GET_PROPS "vdevprops_get_props" /* * The following are names used when invoking ZFS_IOC_VDEV_SET_PROP. */ #define ZPOOL_VDEV_PROPS_SET_VDEV "vdevprops_set_vdev" #define ZPOOL_VDEV_PROPS_SET_PROPS "vdevprops_set_props" /* * The following are names used when invoking ZFS_IOC_WAIT_FS. 
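 *
 * libzfs_core fills these in on behalf of callers; a sketch of waiting
 * for a dataset's delete queue to drain:
 *
 *	boolean_t waited;
 *	int error = lzc_wait_fs("pool/fs", ZFS_WAIT_DELETEQ, &waited);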
*/ #define ZFS_WAIT_ACTIVITY "wait_activity" #define ZFS_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_POOL_PREFETCH. */ #define ZPOOL_PREFETCH_TYPE "prefetch_type" /* * The following are names used when invoking ZFS_IOC_DDT_PRUNE. */ #define DDT_PRUNE_UNIT "ddt_prune_unit" #define DDT_PRUNE_AMOUNT "ddt_prune_amount" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_ONLINE_SPARE 0x10 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 #define ZFS_IMPORT_SKIP_MMP 0x20 #define ZFS_IMPORT_LOAD_KEYS 0x40 #define ZFS_IMPORT_CHECKPOINT 0x80 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_FINISH * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. 
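 *
 * A consumer of these events might unpack the payload as follows (a
 * sketch; handle_pool_event() is a hypothetical handler):
 *
 *	const char *pool;
 *	uint64_t guid;
 *	if (nvlist_lookup_string(event, ZFS_EV_POOL_NAME, &pool) == 0 &&
 *	    nvlist_lookup_uint64(event, ZFS_EV_POOL_GUID, &guid) == 0)
 *		handle_pool_event(pool, guid);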
*/ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * The recordsize property can not be set larger than zfs_max_recordsize * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. */ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* supported encryption algorithms */ enum zio_encrypt { ZIO_CRYPT_INHERIT = 0, ZIO_CRYPT_ON, ZIO_CRYPT_OFF, ZIO_CRYPT_AES_128_CCM, ZIO_CRYPT_AES_192_CCM, ZIO_CRYPT_AES_256_CCM, ZIO_CRYPT_AES_128_GCM, ZIO_CRYPT_AES_192_GCM, ZIO_CRYPT_AES_256_GCM, ZIO_CRYPT_FUNCTIONS }; #define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM #define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF /* * xattr namespace prefixes. These are forbidden in xattr names. * * For cross-platform compatibility, xattrs in the user namespace should not be * prefixed with the namespace name, but for backwards compatibility with older * ZFS on Linux versions we do prefix the namespace. */ #define ZFS_XA_NS_FREEBSD_PREFIX "freebsd:" #define ZFS_XA_NS_FREEBSD_PREFIX_LEN strlen("freebsd:") #define ZFS_XA_NS_LINUX_SECURITY_PREFIX "security." #define ZFS_XA_NS_LINUX_SECURITY_PREFIX_LEN strlen("security.") #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX "system." #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX_LEN strlen("system.") #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX "trusted." #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX_LEN strlen("trusted.") #define ZFS_XA_NS_LINUX_USER_PREFIX "user." 
#define ZFS_XA_NS_LINUX_USER_PREFIX_LEN strlen("user.") #define ZFS_XA_NS_PREFIX_MATCH(ns, name) \ (strncmp(name, ZFS_XA_NS_##ns##_PREFIX, \ ZFS_XA_NS_##ns##_PREFIX_LEN) == 0) #define ZFS_XA_NS_PREFIX_FORBIDDEN(name) \ (ZFS_XA_NS_PREFIX_MATCH(FREEBSD, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SECURITY, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SYSTEM, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name)) #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 66db16b33c51..39971827073f 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1,1307 +1,1308 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Datto Inc. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. */ typedef struct brt_vdev brt_vdev_t; typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct zbookmark_phys zbookmark_phys_t; typedef struct zbookmark_err_phys zbookmark_err_phys_t; struct bpobj; struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done * according to it, which makes 2^ashift the smallest possible IO on a vdev. * * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB * (2^16 = 65,536). */ #define ASHIFT_MIN 9 #define ASHIFT_MAX 16 /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. 
* The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 #define SPA_VDEVBITS 24 #define SPA_COMPRESSMASK ((1U << SPA_COMPRESSBITS) - 1) /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. */ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Some checksums/hashes need a 256-bit initialization salt. This salt is kept * secret and is suitable for use in MAC algorithms as the key. */ typedef struct zio_cksum_salt { uint8_t zcs_bytes[32]; } zio_cksum_salt_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. * The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | pad | vdev3 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 |R| padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * R rewrite (reallocated/rewritten at phys birth TXG) * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by * device removal. * log. 
birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * The blkptr_t's of encrypted blocks also need to store the encryption * parameters so that the block can be decrypted. This layout is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | salt | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 | IV1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 |R| padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | IV2 | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | MAC[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | MAC[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * salt Salt for generating encryption keys * IV1 First 64 bits of encryption IV * X Block requires encryption handling (set to 1) * E blkptr_t contains embedded data (set to 0, see below) * fill count number of non-zero blocks under this bp (truncated to 32 bits) * IV2 Last 32 bits of encryption IV * checksum[2] 128-bit checksum of the data this bp describes * MAC[2] 128-bit message authentication code for this data * * The X bit being set indicates that this block is one of 3 types. If this is * a level 0 block with an encrypted object type, the block is encrypted * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted * object type, this block is authenticated with an HMAC (see * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC * words to store a checksum-of-MACs from the level below (see * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() * refers to any of these 3 kinds of blocks. * * The additional encryption parameters are the salt, IV, and MAC which are * explained in greater detail in the block comment at the top of zio_crypt.c. * The MAC occupies half of the checksum space since it serves a very similar * purpose: to prevent data corruption on disk. The only functional difference * is that the checksum is used to detect on-disk corruption whether or not the * encryption key is loaded and the MAC provides additional protection against * malicious disk tampering. We use the 3rd DVA to store the salt and first * 64 bits of the IV. 
As a result encrypted blocks can only have 2 copies * maximum instead of the normal 3. The last 32 bits of the IV are stored in * the upper bits of what is usually the fill count. Note that only blocks at * level 0 or -2 are ever encrypted, which allows us to guarantee that these * 32 bits are not trampled over by other code (see zio_crypt.c for details). * The salt and IV are not used for authenticated bps or bps with an indirect * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits * for the fill count. */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. * * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use * the payload space for encryption parameters (see the comment above on * how encryption parameters are stored). 
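 *
 * A worked example: BPE_NUM_WORDS below is 14, so an embedded bp holds
 * up to BPE_PAYLOAD_SIZE = 14 * 8 = 112 bytes of (possibly compressed)
 * payload. A reader sketch:
 *
 *	if (BP_IS_EMBEDDED(bp) &&
 *	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA)
 *		err = decode_embedded_bp(bp, buf, BPE_GET_LSIZE(bp));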
*/ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */ BP_EMBEDDED_TYPE_REDACTED, NUM_BP_EMBEDDED_TYPES } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1])) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ /* * A block is a hole when it either 1) has never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. The logical size, level, DMU object * type, and birth times are also stored for holes that were written to at * some point (i.e. were punched after having been filled). */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_prop2; /* additional properties */ uint64_t blk_pad; /* Extra space for the future */ uint64_t blk_birth_word[2]; uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; /* * Macros to get and set fields in a bp or DVA. */ /* * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for * this gang DVA including its children BP's. The space allocated at this * DVA's vdev/offset is vdev_gang_header_asize(vdev). */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_OFFSET(dva, x) \ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_LSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ } while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ?
0 : \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_PSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ } while (0) #define BP_GET_COMPRESS(bp) \ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) #define BP_SET_COMPRESS(bp, x) \ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) #define BP_GET_CHECKSUM(bp) \ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ BF64_GET((bp)->blk_prop, 40, 8)) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) /* encrypted, authenticated, and MAC cksum bps use the same bit */ #define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) #define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) #define BP_IS_ENCRYPTED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_IS_AUTHENTICATED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) #define BP_IS_PROTECTED(bp) \ (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) /* * Block birth time macros for different use cases: * - BP_GET_LOGICAL_BIRTH(): When the block was logically modified by the user. * To be used with a focus on user data, like incremental replication. * - BP_GET_PHYSICAL_BIRTH(): When the block was physically written to disks. * For regular writes it is equal to logical birth. For dedup and block cloning * it can be smaller than logical birth. For remapped and rewritten blocks it * can be bigger. To be used with a focus on physical disk content: ARC, DDT, * scrub. * - BP_GET_RAW_PHYSICAL_BIRTH(): Raw physical birth value. Zero if equal * to logical birth. Should only be used for BP copying and debugging. * - BP_GET_BIRTH(): When the block was allocated, which is a physical birth * for rewritten blocks (rewrite flag set) or logical birth otherwise. */ #define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1] #define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x)) #define BP_GET_RAW_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] #define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x)) #define BP_GET_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ BP_GET_RAW_PHYSICAL_BIRTH(bp) ? BP_GET_RAW_PHYSICAL_BIRTH(bp) : \ BP_GET_LOGICAL_BIRTH(bp)) #define BP_GET_BIRTH(bp) \ ((BP_IS_EMBEDDED(bp) || !BP_GET_REWRITE(bp)) ? \ BP_GET_LOGICAL_BIRTH(bp) : BP_GET_PHYSICAL_BIRTH(bp)) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BP_SET_LOGICAL_BIRTH(bp, logical); \ BP_SET_PHYSICAL_BIRTH(bp, \ ((logical) == (physical) ? 0 : (physical))); \ } #define BP_GET_FILL(bp) \ (BP_IS_EMBEDDED(bp) ? 1 : \ BP_IS_ENCRYPTED(bp) ?
BF64_GET((bp)->blk_fill, 0, 32) : \ (bp)->blk_fill) #define BP_SET_FILL(bp, fill) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ if (BP_IS_ENCRYPTED(bp)) \ BF64_SET((bp)->blk_fill, 0, 32, fill); \ else \ (bp)->blk_fill = fill; \ } #define BP_GET_IV2(bp) \ (ASSERT(BP_IS_ENCRYPTED(bp)), \ BF64_GET((bp)->blk_fill, 32, 32)) #define BP_SET_IV2(bp, iv2) \ { \ ASSERT(BP_IS_ENCRYPTED(bp)); \ BF64_SET((bp)->blk_fill, 32, 32, iv2); \ } #define BP_GET_REWRITE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : BF64_GET((bp)->blk_prop2, 63, 1)) #define BP_SET_REWRITE(bp, x) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop2, 63, 1, x); \ } #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_GET_UCSIZE(bp) \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ (BP_GET_PHYSICAL_BIRTH(bp1) == BP_GET_PHYSICAL_BIRTH(bp2) && \ BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) #define BP_IS_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) #define BP_SET_REDACTED(bp) \ { \ BP_SET_EMBEDDED(bp, B_TRUE); \ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED); \ } #define BP_IS_REDACTED(bp) \ (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) #define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[0] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ } #define BP_ZERO(bp) \ { \ BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ (bp)->blk_prop2 = 0; \ (bp)->blk_pad = 0; \ (bp)->blk_birth_word[0] = 0; \ (bp)->blk_birth_word[1] = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } #ifdef _ZFS_BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) #define BP_SPRINTF_LEN 400 /* * This macro allows code sharing between zfs, libzpool, and mdb. * 'func' is either kmem_scnprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
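 *
 * A hedged usage sketch (this mirrors what snprintf_blkptr() does in
 * spa_misc.c; 'type', 'checksum', and 'compress' stand for the usual
 * name strings looked up for the bp):
 *
 *	char blkbuf[BP_SPRINTF_LEN];
 *	SNPRINTF_BLKPTR(kmem_scnprintf, ' ', blkbuf, sizeof (blkbuf), bp,
 *	    type, checksum, compress);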
*/ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *const copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ const char *crypt_type; \ if (bp != NULL) { \ if (BP_IS_ENCRYPTED(bp)) { \ crypt_type = "encrypted"; \ /* LINTED E_SUSPICIOUS_COMPARISON */ \ } else if (BP_IS_AUTHENTICATED(bp)) { \ crypt_type = "authenticated"; \ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \ crypt_type = "indirect-MAC"; \ } else { \ crypt_type = "unencrypted"; \ } \ } \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, \ "HOLE [L%llu %s] " \ "size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ if (BP_IS_ENCRYPTED(bp)) { \ len += func(buf + len, size - len, \ "salt=%llx iv=%llx:%llx%c", \ (u_longlong_t)bp->blk_dva[2].dva_word[0], \ (u_longlong_t)bp->blk_dva[2].dva_word[1], \ (u_longlong_t)BP_GET_IV2(bp), \ ws); \ } \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%016llx:%016llx:%016llx:%016llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ crypt_type, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #define BP_GET_BUFC_TYPE(bp) \ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; typedef enum spa_mode { SPA_MODE_UNINIT = 0, SPA_MODE_READ = 1, SPA_MODE_WRITE = 2, } spa_mode_t; /* * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, } spa_autotrim_t; /* * Reason TRIM command was issued, used internally for accounting purposes. 
*/ typedef enum trim_type { TRIM_TYPE_MANUAL = 0, TRIM_TYPE_AUTO = 1, TRIM_TYPE_SIMPLE = 2 } trim_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, const void *tag); extern int spa_open_rewind(const char *pool, spa_t **, const void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(const char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(const char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 #define SPA_ASYNC_DETACH_SPARE 0x4000 #define SPA_ASYNC_REMOVE_BY_USER 0x8000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_alloc(spa_t *spa, uint64_t guid); extern int spa_vdev_noalloc(spa_t *spa, uint64_t guid); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist); extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t 
guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, uint64_t txgend); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); extern uint_t zfs_sync_pass_deferred_free; /* spa sync taskqueues */ taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); void spa_sync_tq_destroy(spa_t *spa); uint_t spa_acq_allocator(spa_t *spa); void spa_rel_allocator(spa_t *spa, uint_t allocator); void spa_select_allocator(zio_t *zio); /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; extern avl_tree_t spa_namespace_avl; extern kcondvar_t spa_namespace_cv; /* * SPA configuration functions in spa_config.c */ #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); extern void spa_close(spa_t *spa, const void *tag); extern void spa_async_close(spa_t *spa, const void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Historical pool statistics */ typedef struct spa_history_kstat { kmutex_t lock; uint64_t count; uint64_t size; kstat_t *kstat; void *priv; list_t list; } spa_history_kstat_t; typedef struct spa_history_list { uint64_t size; procfs_list_t procfs_list; } spa_history_list_t; typedef struct spa_stats { spa_history_list_t read_history; spa_history_list_t txg_history; spa_history_kstat_t tx_assign_histogram; spa_history_list_t mmp_history; spa_history_kstat_t state; /* pool state */ spa_history_kstat_t guid; /* pool guid */ spa_history_kstat_t iostats; } spa_stats_t; typedef enum txg_state { TXG_STATE_BIRTH = 0, TXG_STATE_OPEN = 1, TXG_STATE_QUIESCED = 2, TXG_STATE_WAIT_FOR_SYNC = 3, TXG_STATE_SYNCED = 4, TXG_STATE_COMMITTED = 5, } txg_state_t; typedef struct txg_stat { vdev_stat_t vs1; vdev_stat_t vs2; uint64_t txg; uint64_t ndirty; } txg_stat_t; /* Assorted pool IO kstats */ typedef struct spa_iostats { kstat_named_t trim_extents_written; kstat_named_t trim_bytes_written; kstat_named_t trim_extents_skipped; kstat_named_t trim_bytes_skipped; kstat_named_t trim_extents_failed; kstat_named_t trim_bytes_failed; kstat_named_t autotrim_extents_written; kstat_named_t 
autotrim_bytes_written; kstat_named_t autotrim_extents_skipped; kstat_named_t autotrim_bytes_skipped; kstat_named_t autotrim_extents_failed; kstat_named_t autotrim_bytes_failed; kstat_named_t simple_trim_extents_written; kstat_named_t simple_trim_bytes_written; kstat_named_t simple_trim_extents_skipped; kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; kstat_named_t arc_read_count; kstat_named_t arc_read_bytes; kstat_named_t arc_write_count; kstat_named_t arc_write_bytes; kstat_named_t direct_read_count; kstat_named_t direct_read_bytes; kstat_named_t direct_write_count; kstat_named_t direct_write_bytes; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags); extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, struct dsl_pool *); extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, hrtime_t duration); extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, int error); extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags); extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining); extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); extern void spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) __printflike(2, 3); extern void spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) 
__printflike(2, 3); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log state */ typedef enum spa_log_state { SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); extern void spa_deadman(void *); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern uint32_t spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_special_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio); extern boolean_t spa_special_has_ddt(spa_t *spa); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint64_t spa_get_failmode(spa_t *spa); 
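/*
 * A hedged usage sketch of the spa_get_min_alloc_range() accessor declared
 * above (assumes MIN/MAX from sysmacros and a held spa): clamping a proposed
 * allocation size to the normal-class bounds it reports:
 *
 *	uint64_t minalloc, maxalloc;
 *	spa_get_min_alloc_range(spa, &minalloc, &maxalloc);
 *	size = MAX(minalloc, MIN(size, maxalloc));
 */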
extern uint64_t spa_get_deadman_failmode(spa_t *spa); extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_get_last_scrubbed_txg(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); extern int spa_get_allocator(spa_t *spa); extern void spa_set_allocator(spa_t *spa, const char *allocator); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...) __attribute__((format(printf, 2, 3))); extern void spa_load_note(spa_t *spa, const char *fmt, ...) __attribute__((format(printf, 2, 3))); extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_generate_guid(spa_t *spa); extern uint64_t spa_generate_load_guid(void); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa, const uint64_t *guidp); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_l2cache(spa_t *, uint64_t guid); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_dedup(spa_t *spa); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_has_special(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, void *arg); extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg); extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); extern spa_mode_t spa_mode(spa_t *spa); 
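/*
 * A hedged sketch of the common config lock pattern built on the
 * spa_config_enter()/spa_config_exit() declarations above (this is the
 * pattern spa_config_generate() uses when called without a vdev):
 *
 *	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 *	... examine spa->spa_root_vdev ...
 *	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 */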
extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth); extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, zio_t *zio); extern void zfs_ereport_taskq_fini(void); extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_approx_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); extern uint64_t spa_get_last_errlog_size(spa_t *spa); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); extern void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx); extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, dmu_tx_t *tx); extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx); extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx); extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t *top_affected_fs); extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg); extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb); extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, nvlist_t *outnvl); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); extern void 
zfs_ereport_zvol_post(const char *subclass, const char *name, const char *device_name, const char *raw_name); /* waiting for pool activities to complete */ extern int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited); extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited); extern void spa_notify_waiters(spa_t *spa); extern void spa_wake_waiters(spa_t *spa); extern void spa_import_os(spa_t *spa); extern void spa_export_os(spa_t *spa); extern void spa_activate_os(spa_t *spa); extern void spa_deactivate_os(spa_t *spa); /* module param call functions */ int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); int param_set_active_allocator(ZFS_MODULE_PARAM_ARGS); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ } while (0) #else #define dprintf_bp(bp, fmt, ...) #endif extern spa_mode_t spa_mode_global; extern int zfs_deadman_enabled; extern uint64_t zfs_deadman_synctime_ms; extern uint64_t zfs_deadman_ziotime_ms; extern uint64_t zfs_deadman_checktime_ms; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; #ifdef __cplusplus } #endif #endif /* _SYS_SPA_H */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 07a959db3447..62b062984d36 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -1,509 +1,510 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. 
*/ #ifndef _SYS_SPA_IMPL_H #define _SYS_SPA_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_crrd.h" #ifdef __cplusplus extern "C" { #endif typedef struct spa_allocs_use { kmutex_t sau_lock; uint_t sau_rotor; boolean_t sau_inuse[]; } spa_allocs_use_t; typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; avl_node_t se_avl; zbookmark_err_phys_t se_zep; /* not accounted in avl_find */ } spa_error_entry_t; typedef struct spa_history_phys { uint64_t sh_pool_create_len; /* ending offset of zpool create */ uint64_t sh_phys_max_off; /* physical EOF */ uint64_t sh_bof; /* logical BOF */ uint64_t sh_eof; /* logical EOF */ uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; /* * All members must be uint64_t, for byteswap purposes. */ typedef struct spa_removing_phys { uint64_t sr_state; /* dsl_scan_state_t */ /* * The vdev ID that we most recently attempted to remove, * or -1 if no removal has been attempted. */ uint64_t sr_removing_vdev; /* * The vdev ID that we most recently successfully removed, * or -1 if no devices have been removed. */ uint64_t sr_prev_indirect_vdev; uint64_t sr_start_time; uint64_t sr_end_time; /* * Note that we can not use the space map's or indirect mapping's * accounting as a substitute for these values, because we need to * count frees of not-yet-copied data as though it did the copy. * Otherwise, we could get into a situation where copied > to_copy, * or we complete before copied == to_copy. */ uint64_t sr_to_copy; /* bytes that need to be copied */ uint64_t sr_copied; /* bytes that have been copied or freed */ } spa_removing_phys_t; /* * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense * of an indirect vdev's mapping object is in progress. */ typedef struct spa_condensing_indirect_phys { /* * The vdev ID of the indirect vdev whose indirect mapping is * being condensed. */ uint64_t scip_vdev; /* * The vdev's old obsolete spacemap. This spacemap's contents are * being integrated into the new mapping. */ uint64_t scip_prev_obsolete_sm_object; /* * The new mapping object that is being created. */ uint64_t scip_next_mapping_object; } spa_condensing_indirect_phys_t; struct spa_aux_vdev { uint64_t sav_object; /* MOS object for device list */ nvlist_t *sav_config; /* cached device config */ vdev_t **sav_vdevs; /* devices */ int sav_count; /* number devices */ boolean_t sav_sync; /* sync the device list */ boolean_t sav_label_sync; /* sync aux labels */ nvlist_t **sav_pending; /* pending device additions */ uint_t sav_npending; /* # pending devices */ }; typedef struct spa_config_lock { kmutex_t scl_lock; kthread_t *scl_writer; int scl_write_wanted; int scl_count; kcondvar_t scl_cv; } ____cacheline_aligned spa_config_lock_t; typedef struct spa_config_dirent { list_node_t scd_link; char *scd_path; } spa_config_dirent_t; typedef enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES } zio_taskq_type_t; /* * State machine for the zpool-poolname process. 
The state transitions * are made as follows: * * From To Routine * PROC_NONE -> PROC_CREATED spa_activate() * PROC_CREATED -> PROC_ACTIVE spa_thread() * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() * PROC_DEACTIVATE -> PROC_GONE spa_thread() * PROC_GONE -> PROC_NONE spa_deactivate() */ typedef enum spa_proc_state { SPA_PROC_NONE, /* spa_proc = &p0, no process created */ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ } spa_proc_state_t; typedef struct spa_taskqs { uint_t stqs_count; taskq_t **stqs_taskq; } spa_taskqs_t; /* one for each thread in the spa sync taskq */ typedef struct spa_syncthread_info { kthread_t *sti_thread; uint_t sti_allocator; } spa_syncthread_info_t; typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ AVZ_ACTION_INITIALIZE } spa_avz_action_t; typedef enum spa_config_source { SPA_CONFIG_SRC_NONE = 0, SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ SPA_CONFIG_SRC_MOS /* MOS, but not always from the right txg */ } spa_config_source_t; struct spa { /* * Fields protected by spa_namespace_lock. */ char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */ char *spa_comment; /* comment */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ nvlist_t *spa_config_splitting; /* config for splitting */ nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ uint32_t spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ boolean_t spa_trust_config; /* do we trust vdev tree? */ boolean_t spa_is_splitting; /* in the middle of a split? */ spa_config_source_t spa_config_source; /* where config comes from?
*/ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ kthread_t *spa_export_thread; /* valid during pool export */ kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ metaslab_class_t *spa_special_embedded_log_class; /* log on special */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ uint64_t spa_load_max_txg; /* best initial ub_txg */ uint64_t spa_claim_max_txg; /* highest claimed birth txg */ inode_timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ list_t spa_evicting_os_list; /* Objsets being evicted. */ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_max_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ spa_allocs_use_t *spa_allocs_use; int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ /* per-allocator sync thread taskqs */ taskq_t *spa_sync_tq; spa_syncthread_info_t *spa_syncthreads; spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ boolean_t spa_aux_sync_uber; /* need to sync aux uber */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ /* checksum context templates */ kmutex_t spa_cksum_tmpls_lock; void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ /* in-flight verification bytes */ uint64_t spa_load_verify_bytes; kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? 
*/ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ uint64_t spa_scan_pass_start; /* start time per pass/reboot */ uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ uint64_t spa_scrubbed_last_txg; /* last txg scrubbed */ /* error scrub pause time in milliseconds */ uint64_t spa_scan_pass_errorscrub_pause; /* total error scrub paused time in milliseconds */ uint64_t spa_scan_pass_errorscrub_spent_paused; /* * We are in the middle of a resilver, and another resilver * is needed once this one completes. This is set iff any * vdev_resilver_deferred is set. */ boolean_t spa_resilver_deferred; kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ uint64_t spa_nonallocating_dspace; spa_removing_phys_t spa_removing_phys; spa_vdev_removal_t *spa_vdev_removal; spa_condensing_indirect_phys_t spa_condensing_indirect_phys; spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. */ vdev_raidz_expand_t *spa_raidz_expand; zthr_t *spa_raidz_expand_zthr; uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; kmutex_t spa_txg_log_time_lock; /* for spa_txg_log_time */ dbrrd_t spa_txg_log_time; uint64_t spa_last_noted_txg; uint64_t spa_last_noted_txg_time; uint64_t spa_last_flush_txg_time; space_map_t *spa_syncing_log_sm; /* current log space map */ avl_tree_t spa_sm_logs_by_txg; kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ avl_tree_t spa_metaslabs_by_flushed; spa_unflushed_stats_t spa_unflushed_stats; list_t spa_log_summary; uint64_t spa_log_flushall_txg; zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ uint64_t spa_livelists_to_delete; /* set of livelists to free */ livelist_condense_entry_t spa_to_condense; /* next to condense */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ uint64_t spa_load_txg; /* ub txg that loaded */ uint64_t spa_load_txg_ts; /* timestamp from that ub */ uint64_t spa_load_meta_errors; /* verify metadata err count */ uint64_t spa_load_data_errors; /* verify data err count */ uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ avl_tree_t spa_errlist_healed; /* list of healed blocks */ uint64_t spa_deflate; /* should we 
deflate? */ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_deadman_failmode; /* failure mode for deadman */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ /* per-CPU array of root of async I/O: */ zio_t **spa_async_zio_root; zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ zio_suspend_reason_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ spa_mode_t spa_mode; /* SPA_MODE_{READ|WRITE} */ boolean_t spa_read_spacemaps; /* spacemaps available if ro */ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ uint64_t spa_ddt_stat_object; /* DDT statistics */ uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ uint64_t spa_rdspace; /* raw (non-dedup) --//-- */ boolean_t spa_active_ddt_prune; /* ddt prune process active */ brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */ uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */ uint64_t spa_brt_rangesize; /* pool's BRT range size */ krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ spa_proc_state_t spa_proc_state; /* see definition */ proc_t *spa_proc; /* "zpool-poolname" process */ uintptr_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ uint64_t spa_prev_software_version; /* See ub_software_version */ uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ nvlist_t *spa_feat_stats; /* Cache of enabled features */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; taskqid_t spa_deadman_tqid; /* Task id */ uint64_t spa_deadman_calls; /* number of deadman calls */ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ uint64_t spa_deadman_synctime; /* deadman sync expiration */ uint64_t spa_deadman_ziotime; /* deadman zio expiration */ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ uint64_t spa_autotrim; /* automatic background trim? 
*/ uint64_t spa_errata; /* errata issues detected */ spa_stats_t spa_stats; /* assorted spa statistics */ spa_keystore_t spa_keystore; /* loaded crypto keys */ /* arc_memory_throttle() parameters during low memory condition */ uint64_t spa_lowmem_page_load; /* memory load during txg */ uint64_t spa_lowmem_last_txg; /* txg window start */ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */ taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ uint64_t spa_leaf_list_gen; /* track leaf_list changes */ uint32_t spa_hostid; /* cached system hostid */ /* synchronization for threads in spa_wait */ kmutex_t spa_activities_lock; kcondvar_t spa_activities_cv; kcondvar_t spa_waiters_cv; int spa_waiters; /* number of waiting threads */ boolean_t spa_waiters_cancel; /* waiters should return */ char *spa_compatibility; /* compatibility file(s) */ uint64_t spa_dedup_table_quota; /* property DDT maximum size */ uint64_t spa_dedup_dsize; /* cached on-disk size of DDT */ uint64_t spa_dedup_class_full_txg; /* txg dedup class was full */ /* * spa_refcount & spa_config_lock must be the last elements * because zfs_refcount_t changes size based on compilation options. * In order for the MDB module to function correctly, the other * fields must remain in the same location. */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ }; extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, zio_t *zio, boolean_t cutinline); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name); extern void spa_event_post(sysevent_t *ev); extern int param_set_deadman_failmode_common(const char *val); extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); extern const char *zfs_active_allocator; extern int param_set_active_allocator_common(const char *val); #ifdef __cplusplus } #endif #endif /* _SYS_SPA_IMPL_H */ diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index cf28955b0c50..f615591e826b 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -1,546 +1,548 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2021, Colm Buckley */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Pool configuration repository. * * Pool configuration is stored as a packed nvlist on the filesystem. When * pools are imported, they are added to the /etc/zfs/zpool.cache file and * removed from it when exported. For each cache file, we have a single nvlist * which holds all the configuration information. Pools can also have the * 'cachefile' property set, which allows this config to be stored in an * alternate location under the control of external software. * * The kernel independently maintains an AVL tree of imported pools. See the * "SPA locking" comment in spa.c. Whenever a pool configuration is modified * we call spa_write_cachefile(), which walks through all the active pools and * writes the updated configuration to the /etc/zfs/zpool.cache file. */ static uint64_t spa_config_generation = 1; /* * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ char *spa_config_path = (char *)ZPOOL_CACHE; static int spa_config_remove(spa_config_dirent_t *dp) { int error = 0; /* * Remove the cache file. If zfs_file_unlink() is not supported by the * platform, fall back to truncating the file, which is functionally * equivalent. */ error = zfs_file_unlink(dp->scd_path); if (error == EOPNOTSUPP) { int flags = O_RDWR | O_TRUNC; zfs_file_t *fp; error = zfs_file_open(dp->scd_path, flags, 0644, &fp); if (error == 0) { (void) zfs_file_fsync(fp, O_SYNC); (void) zfs_file_close(fp); } } return (error); } static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE; char *temp; int err; zfs_file_t *fp; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { err = spa_config_remove(dp); if (err == ENOENT) err = 0; return (err); } /* * Pack the configuration into a buffer. */ buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); /* * Write the configuration to disk. Due to the complexity involved * in performing a rename and remove from within the kernel, the file * is instead truncated and overwritten in place. This way we always * have a consistent view of the data or a zero length file. */ err = zfs_file_open(dp->scd_path, oflags, 0644, &fp); if (err == 0) { err = zfs_file_write(fp, buf, buflen, NULL); if (err == 0) err = zfs_file_fsync(fp, O_SYNC); zfs_file_close(fp); if (err) (void) spa_config_remove(dp); } fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); return (err); } /* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not.
If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicit import * would be required. */ void spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, boolean_t postblkidevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; const char *pool_name; boolean_t ccw_failure; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (!(spa_mode_global & SPA_MODE_WRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. */ if ((spa == target && removing) || !spa_writeable(spa)) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) nvl = fnvlist_alloc(); if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) pool_name = fnvlist_lookup_string( spa->spa_config, ZPOOL_CONFIG_POOL_NAME); else pool_name = spa_name(spa); fnvlist_add_nvlist(nvl, pool_name, spa->spa_config); mutex_exit(&spa->spa_props_lock); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { (void) zfs_ereport_post( FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); /* * Post udev event to sync blkid information if the pool is created * or a new vdev is added to the pool. */ if ((target->spa_root_vdev) && postblkidevent) { vdev_post_kobj_evt(target->spa_root_vdev); for (int i = 0; i < target->spa_l2cache.sav_count; i++) vdev_post_kobj_evt(target->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < target->spa_spares.sav_count; i++) vdev_post_kobj_evt(target->spa_spares.sav_vdevs[i]); } } /* * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, * and we don't want to allow the local zone to see all the pools anyway. 
* So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pools visible within the zone. */ int spa_all_configs(uint64_t *generation, nvlist_t **pools) { spa_t *spa = NULL; if (*generation == spa_config_generation) return (SET_ERROR(EEXIST)); int error = mutex_enter_interruptible(&spa_namespace_lock); if (error) return (SET_ERROR(EINTR)); *pools = fnvlist_alloc(); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(*pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } } *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); return (0); } void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL && spa->spa_config != config) nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } /* * Generate the pool's configuration based on the current in-core state. * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ nvlist_t * spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; const char *pool_name; if (vd == NULL) { vd = rvd; locked = B_TRUE; spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. */ if (txg == -1ULL) txg = spa->spa_config_txg; /* * Originally, users had to handle spa namespace collisions by either * exporting the already imported pool or by specifying a new name for * the pool with a conflicting name. In the case of root pools from * virtual guests, neither approach to collision resolution is * reasonable. This is addressed by extending the new name syntax with * an option to specify that the new name is temporary. When specified, * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us * to use the previous name, which we do below.
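 * For example (pool names here are illustrative), "zpool import -t
 * tank tanktmp" imports the pool under the temporary name "tanktmp"
 * and sets ZFS_IMPORT_TEMP_NAME; the name stored in the on-disk
 * config (ZPOOL_CONFIG_POOL_NAME) remains "tank", which is what the
 * lookup below recovers.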
*/ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { VERIFY0(nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &pool_name)); } else pool_name = spa_name(spa); config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); if (spa->spa_compatibility != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY, spa->spa_compatibility); hostid = spa_get_hostid(spa); if (hostid != 0) fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); int config_gen_flags = 0; if (vd != rvd) { fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_isspare) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL); if (vd->vdev_islog) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL); vd = vd->vdev_top; /* label contains top config */ } else { /* * Only add the (potentially large) split information * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, spa->spa_config_splitting); fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); /* * If we're splitting, record the original pool's guid. */ if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); } nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); /* * Store what's necessary for reading the MOS in the label. 
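 * A hypothetical label consumer might walk this list to decide
 * whether it can read the pool at all; a minimal sketch (the "label"
 * nvlist name is an assumption, error handling elided):
 *
 *	nvlist_t *feat;
 *	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
 *	    &feat) == 0) {
 *		for (nvpair_t *nvp = nvlist_next_nvpair(feat, NULL);
 *		    nvp != NULL; nvp = nvlist_next_nvpair(feat, nvp)) {
 *			if (!zfeature_is_supported(nvpair_name(nvp)))
 *				return (SET_ERROR(ENOTSUP));
 *		}
 *	}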
*/ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, spa->spa_label_features); if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); } else { /* * If we have top-level vdevs that were added but have * not yet been prepared for allocation, do that now. * (It's safe now because the config cache is up to date, * so it will be able to translate the new DVAs.) * See comments in spa_vdev_add() for full details. */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * Explicitly skip vdevs that are indirect or * log vdevs that are being removed. The reason * is that both of those can have vdev_ms_array * set to 0 and we wouldn't want to change their * metaslab size nor call vdev_expand() on them. */ if (!vdev_is_concrete(tvd) || (tvd->vdev_islog && tvd->vdev_removing)) continue; if (tvd->vdev_ms_array == 0) vdev_metaslab_set_size(tvd); vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. */ txg_wait_synced(spa->spa_dsl_pool, txg); /* * Update the global config cache to reflect the new mosconfig. */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL, what != SPA_CONFIG_UPDATE_POOL); } if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, "SPA config file (/etc/zfs/zpool.cache)"); #endif diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6f7c060f97f8..a539468df6a1 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1,3181 +1,3196 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include #include #include #include /* * SPA locking * * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy * - Held at the start and end of import and export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa zfs_refcount_t protected by mutex) * * This reference count keeps track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t.
* * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. 
For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct * is on by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ int zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings.
First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. */ int zfs_deadman_enabled = B_TRUE; /* * Controls the behavior of the deadman when it detects a "hung" I/O. * Valid values are zfs_deadman_failmode=. * * wait - Wait for the "hung" I/O (default) * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ const char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ uint_t spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed (bounded by spa_max_slop). This ensures that we * don't run the pool completely out of space, due to unaccounted changes (e.g. * to the MOS). It also limits the worst-case time to allocate space. If we * have less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are * also part of this 3.2% of space which can't be consumed by normal writes; * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded * log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * Operations that are almost guaranteed to free up space in the absence of * a pool checkpoint can use up to three quarters of the slop space * (e.g zfs destroy). * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net * increase in the amount of space used, it is possible to run the pool * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. 
* * Further, on very large pools, the slop space will be smaller than * 3.2%, to avoid reserving much more space than we actually need; bounded * by spa_max_slop (128GB). * * See also the comments in zfs_space_check_t. */ uint_t spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; /* * Number of allocators to use, per spa instance */ static int spa_num_allocators = 4; static int spa_cpus_per_allocator = 4; /* * Spa active allocator. * Valid values are zfs_active_allocator=. */ const char *zfs_active_allocator = "dynamic"; void spa_load_failed(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } void spa_load_note(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* * By default dedup and user data indirects land in the special class */ static int zfs_ddt_data_is_special = B_TRUE; static int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ static uint_t zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); scl->scl_writer = NULL; scl->scl_write_wanted = 0; scl->scl_count = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); ASSERT0P(scl->scl_writer); ASSERT0(scl->scl_write_wanted); ASSERT0(scl->scl_count); } } int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (scl->scl_count != 0) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } return (1); } static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, int mmp_flag) { (void) tag; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || (!mmp_flag && 
scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (scl->scl_count != 0) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 0); } /* * The spa_config_enter_mmp() allows the mmp thread to cut in front of * outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is * a single disk in a pool that is responding slowly and presumably about to * fail. */ void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } void spa_config_exit(spa_t *spa, int locks, const void *tag) { (void) tag; for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(scl->scl_count > 0); if (--scl->scl_count == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && scl->scl_count != 0) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); if (spa == NULL) return (NULL); /* * Avoid racing with import/export, which don't hold the namespace * lock for their entire duration. */ if ((spa->spa_load_thread != NULL && spa->spa_load_thread != curthread) || (spa->spa_export_thread != NULL && spa->spa_export_thread != curthread)) { cv_wait(&spa_namespace_cv, &spa_namespace_lock); goto retry; } return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; /* Disable the deadman if the pool is suspended. 
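 * With the defaults described above, the first "slow spa_sync"
 * message appears zfs_deadman_synctime_ms (600 s) after the sync
 * began, and the check then re-arms itself every
 * zfs_deadman_checktime_ms (60 s) via the taskq_dispatch_delay()
 * below until the sync finally completes.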
*/ if (spa_suspended(spa)) return; zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, (u_longlong_t)++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev, FTAG); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + MSEC_TO_TICK(zfs_deadman_checktime_ms)); } static int spa_log_sm_sort_by_txg(const void *va, const void *vb) { const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_txg_log_time_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); spa_set_allocator(spa, zfs_active_allocator); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) spa->spa_root = spa_strdup(altroot); /* Do not allow more allocators than fraction of CPUs. 
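 * Worked example with the defaults (spa_num_allocators = 4,
 * spa_cpus_per_allocator = 4): an 8-CPU system gets
 * MIN(4, 8 / 4) = 2 allocators, a 32-CPU system gets the full 4, and
 * a uniprocessor system is clamped up to the minimum of 1.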
*/ spa->spa_alloc_count = MAX(MIN(spa_num_allocators, boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); if (spa->spa_alloc_count > 1) { spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, sau_inuse[spa->spa_alloc_count]), KM_SLEEP); mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT, NULL); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY0(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP)); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY0(nvlist_dup(features, &spa->spa_label_features, 0)); } VERIFY0(nvlist_dup(config, &spa->spa_config, 0)); } if (spa->spa_label_features == NULL) { VERIFY0(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP)); } spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } list_create(&spa->spa_leaf_list, sizeof (vdev_t), offsetof(vdev_t, vdev_leaf_node)); return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. 
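 * A sketch of the expected caller ordering, modeled on pool eviction
 * (illustration only; error handling and unload details are elided):
 *
 *	mutex_enter(&spa_namespace_lock);
 *	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 *		spa_unload(spa);
 *		spa_deactivate(spa);
 *	}
 *	spa_remove(spa);
 *	mutex_exit(&spa_namespace_lock);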
*/ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); if (spa->spa_root) spa_strfree(spa->spa_root); while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } if (spa->spa_alloc_count > 1) { mutex_destroy(&spa->spa_allocs_use->sau_lock); kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, sau_inuse[spa->spa_alloc_count])); } avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); zfs_refcount_destroy(&spa->spa_refcount); spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); cv_destroy(&spa->spa_activities_cv); cv_destroy(&spa->spa_waiters_cv); mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); mutex_destroy(&spa->spa_txg_log_time_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held or be part of a pool import/export. */ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread || spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. 
Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, const void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held or be the spa export thread. We really * compare against spa_minref, which is the number of references * acquired when opening a pool. */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } static void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } static void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } static boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } static void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool it's been made a part of.
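 * For example, a disk added as a spare to two different pools is
 * represented by a single spa_aux_t with aux_count == 2; activating
 * it as a replacement in one pool records that pool's guid in
 * aux_pool without changing the count.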
* * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. */ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); } /* * The same as spa_vdev_enter() above but additionally takes the guid of * the vdev being detached. When there is a rebuild in process it will be * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). * The rebuild is canceled if only a single child remains after the detach. 
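 * A sketch of the bracketing pattern a detach path would use
 * (do_detach_work() and unwanted_vd are stand-ins, not real names):
 *
 *	uint64_t txg = spa_vdev_detach_enter(spa, guid);
 *	error = do_detach_work(spa, guid);
 *	return (spa_vdev_exit(spa, unwanted_vd, txg, error));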
*/ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); if (guid != 0) { vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd) { vdev_rebuild_stop_wait(vd->vdev_top); } } return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ metaslab_class_validate(spa_normal_class(spa)); metaslab_class_validate(spa_log_class(spa)); metaslab_class_validate(spa_embedded_log_class(spa)); metaslab_class_validate(spa_special_class(spa)); metaslab_class_validate(spa_special_embedded_log_class(spa)); metaslab_class_validate(spa_dedup_class(spa)); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, NULL); mutex_exit(&vd->vdev_initialize_lock); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); } /* * The vdev may be both a leaf and top-level device. */ vdev_autotrim_stop_wait(vd); spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_STATE_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want to make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state.
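 * The state lock is used in the same bracketed style, e.g. in a
 * fault/online style path (sketch; change_vdev_state() is a
 * stand-in):
 *
 *	spa_vdev_state_enter(spa, SCL_NONE);
 *	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 *	if (vd == NULL)
 *		return (spa_vdev_state_exit(spa, NULL,
 *		    SET_ERROR(ENODEV)));
 *	error = change_vdev_state(vd);
 *	return (spa_vdev_state_exit(spa, vd, error));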
*/ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read from the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; vdev_t *vdev_top; if (vd == NULL || vd == spa->spa_root_vdev) { vdev_top = spa->spa_root_vdev; } else { vdev_top = vd->vdev_top; } if (vd != NULL || error == 0) vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) vdev_state_dirty(vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Return the spa_t associated with the given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding.
*/ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. */ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); memcpy(new, s, len + 1); return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid; if (spa != NULL) { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)); } else { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(guid, 0)); } return (guid); } static boolean_t spa_load_guid_exists(uint64_t guid) { avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa_load_guid(spa) == guid) return (B_TRUE); } return (arc_async_flush_guid_inuse(guid)); } uint64_t spa_generate_load_guid(void) { uint64_t guid; do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_load_guid_exists(guid)); return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; const char *checksum = NULL; const char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. 
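 * For example, zfs_strtonum("ff", NULL) returns 255, and
 * zfs_strtonum("1a/2b", &cp) returns 0x1a (26) and leaves *cp
 * pointing at the '/'.
 */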
*/ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } void spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) { /* * We bump the feature refcount for each special vdev added to the pool */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } boolean_t spa_indirect_vdevs_loaded(spa_t *spa) { return (spa->spa_indirect_vdevs_loaded); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strlcpy(buf, spa->spa_root, buflen); } uint32_t spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. */ uint64_t spa_final_dirty_txg(spa_t *spa) { return (spa->spa_final_txg - TXG_DEFER_SIZE); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* * Return the inflated asize for a logical write in bytes. This is used by the * DMU to calculate the space a logical write will require on disk. * If lsize is smaller than the largest physical block size allocatable on this * pool we use its value instead, since the write will end up using the whole * block anyway. 
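* For example (an illustrative calculation, assuming the default spa_asize_inflation of 24): with spa_max_ashift = 12 (4 KiB blocks), a 512-byte logical write is charged MAX(512, 4096) * 24 = 96 KiB of worst-case asize.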
*/ uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { if (lsize == 0) return (0); /* No inflation needed */ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } +/* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to * the value of spa_max_slop. The embedded log space is not included in * spa_dspace. By subtracting it, the usable space (per "zfs list") is a * constant 97% of the total space, regardless of metaslab size (assuming the * default spa_slop_shift=5 and a non-tiny pool). * * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = 0; uint64_t slop = 0; /* * Make sure spa_dedup_dspace has been set. */ if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); space = spa->spa_rdspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* * Subtract the embedded log space, but no more than half the (3.2%) * unusable space. Note, the "no more than half" is only relevant if * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by * default. */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); embedded_log += metaslab_class_get_dspace( spa_special_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* * Slop space should be at least spa_min_slop, but no more than half * the entire pool. */ slop = MAX(slop, MIN(space >> 1, spa_min_slop)); return (slop); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } uint64_t spa_get_checkpoint_space(spa_t *spa) { return (spa->spa_checkpoint_info.sci_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa)); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that * contribute to dspace. If a file is overwritten, its old * blocks are freed and new blocks are allocated. If there are * no snapshots of the file, the available space should remain * the same. The old blocks could be freed from the * non-allocating vdev, but the new blocks must be allocated on * other (allocating) vdevs. By reserving the entire size of * the non-allocating vdevs (including allocated space), we * ensure that there will be enough space on the allocating * vdevs for this file overwrite to succeed. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace); spa->spa_rdspace -= spa->spa_nonallocating_dspace; } spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); } /* * Return the failure mode that has been set to this pool. 
The default * behavior will be to block all I/Os when a complete failure occurs. */ uint64_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended != ZIO_SUSPEND_NONE); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } metaslab_class_t * spa_embedded_log_class(spa_t *spa) { return (spa->spa_embedded_log_class); } metaslab_class_t * spa_special_class(spa_t *spa) { return (spa->spa_special_class); } metaslab_class_t * spa_special_embedded_log_class(spa_t *spa) { return (spa->spa_special_embedded_log_class); } metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); } boolean_t spa_special_has_ddt(spa_t *spa) { return (zfs_ddt_data_is_special && spa_has_special(spa)); } /* * Locate an appropriate allocation class */ metaslab_class_t * spa_preferred_class(spa_t *spa, const zio_t *zio) { metaslab_class_t *mc = zio->io_metaslab_class; boolean_t tried_dedup = (mc == spa_dedup_class(spa)); boolean_t tried_special = (mc == spa_special_class(spa)); const zio_prop_t *zp = &zio->io_prop; /* * Override object type for the purposes of selecting a storage class. * Primarily for DMU_OTN_ types where we can't explicitly control their * storage class; instead, choose a static type that most closely matches * what we want. */ dmu_object_type_t objtype = zp->zp_storage_type == DMU_OT_NONE ? zp->zp_type : zp->zp_storage_type; /* * ZIL allocations determine their class in zio_alloc_zil(). */ ASSERT(objtype != DMU_OT_INTENT_LOG); if (DMU_OT_IS_DDT(objtype)) { if (spa_has_dedup(spa) && !tried_dedup && !tried_special) return (spa_dedup_class(spa)); else if (spa_special_has_ddt(spa) && !tried_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* Indirect blocks for user data can land in special if allowed */ if (zp->zp_level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { if (zfs_user_indirect_is_special && spa_has_special(spa) && !tried_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) { if (spa_has_special(spa) && !tried_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* * Allow small file or zvol blocks in special class if opted in by * the special_smallblk property. However, always leave a reserve of * zfs_special_class_metadata_reserve_pct exclusively for metadata.
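* For example (illustrative, assuming the default zfs_special_class_metadata_reserve_pct of 25): with a 100 GiB special class, eligible small blocks are steered there only while its allocated space is below 75 GiB; past that limit they fall back to the normal class.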
*/ if ((DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL) && spa_has_special(spa) && !tried_special && zio->io_size <= zp->zp_zpl_smallblk) { metaslab_class_t *special = spa_special_class(spa); uint64_t alloc = metaslab_class_get_alloc(special); uint64_t space = metaslab_class_get_space(special); uint64_t limit = (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100; if (alloc < limit) return (special); } return (spa_normal_class(spa)); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. */ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } spa_autotrim_t spa_get_autotrim(spa_t *spa) { return (spa->spa_autotrim); } uint64_t spa_deadman_ziotime(spa_t *spa) { return (spa->spa_deadman_ziotime); } uint64_t spa_get_deadman_failmode(spa_t *spa) { return (spa->spa_deadman_failmode); } void spa_set_deadman_failmode(spa_t *spa, const char *failmode) { if (strcmp(failmode, "wait") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; else if (strcmp(failmode, "continue") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; else if (strcmp(failmode, "panic") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; else spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; } void spa_set_deadman_ziotime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); } } void spa_set_deadman_synctime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); } } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd != NULL) dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 
spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } uint64_t spa_dirty_data(spa_t *spa) { return (spa->spa_dsl_pool->dp_dirty_total); } /* * ========================================================================== * SPA Import Progress Routines * ========================================================================== */ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; } spa_import_progress_t; spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", "pool_name", "notes"); return (0); } static int spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, (sip->pool_name ? sip->pool_name : "-"), (sip->spa_load_notes ? sip->spa_load_notes : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) { spa_import_progress_t *sip; while (shl->size > size) { sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); if (sip->spa_load_notes) kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list)); } static void spa_import_progress_init(void) { spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t), KM_SLEEP); spa_import_progress_list->size = 0; spa_import_progress_list->procfs_list.pl_private = spa_import_progress_list; procfs_list_install("zfs", NULL, "import_progress", 0644, &spa_import_progress_list->procfs_list, spa_import_progress_show, spa_import_progress_show_header, NULL, offsetof(spa_import_progress_t, smh_node)); } static void spa_import_progress_destroy(void) { spa_history_list_t *shl = spa_import_progress_list; procfs_list_uninstall(&shl->procfs_list); spa_import_progress_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); kmem_free(shl, sizeof (spa_history_list_t)); } int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t load_state) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; if (sip->spa_load_notes != NULL) { kmem_strfree(sip->spa_load_notes); sip->spa_load_notes = NULL; } error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } static void spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, const char *fmt, va_list adx) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; uint64_t pool_guid = spa_guid(spa); if (shl->size == 0) return; char *notes = kmem_vasprintf(fmt, adx); mutex_enter(&shl->procfs_list.pl_lock); for (sip = 
list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->spa_load_notes != NULL) { kmem_strfree(sip->spa_load_notes); sip->spa_load_notes = NULL; } sip->spa_load_notes = notes; if (log_dbgmsg) zfs_dbgmsg("'%s' %s", sip->pool_name, notes); notes = NULL; break; } } mutex_exit(&shl->procfs_list.pl_lock); if (notes != NULL) kmem_strfree(notes); } void spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) { va_list adx; va_start(adx, fmt); spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); va_end(adx); } void spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) { va_list adx; va_start(adx, fmt); spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); va_end(adx); } int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_max_txg = load_max_txg; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->mmp_sec_remaining = mmp_sec_remaining; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * A new import is in progress, add an entry. 
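* Typical lifecycle (an illustrative sketch using only the routines in this file): spa_import_progress_add(spa) when an import begins, spa_import_progress_set_state() and spa_import_progress_set_notes() as it advances, and spa_import_progress_remove(spa_guid(spa)) once it completes.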
*/ void spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; const char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &poolname); if (poolname == NULL) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); shl->size++; mutex_exit(&shl->procfs_list.pl_lock); } void spa_import_progress_remove(uint64_t pool_guid) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); if (sip->spa_load_notes) kmem_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); break; } } mutex_exit(&shl->procfs_list.pl_lock); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (TREE_ISIGN(s)); } void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifndef _KERNEL if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = arc_buf_sigsegv; if (sigaction(SIGSEGV, &sa, NULL) == -1) { perror("could not enable watchpoints: " "sigaction(SIGSEGV, ...)
= "); } else { arc_watch = B_TRUE; } } #endif fm_init(); zfs_refcount_init(); unique_init(); zfs_btree_init(); metaslab_stat_init(); brt_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); chksum_init(); zpool_prop_init(); zpool_feature_init(); vdev_prop_init(); l2arc_start(); scan_init(); qat_init(); spa_import_progress_init(); zap_init(); } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); brt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); scan_fini(); qat_fini(); spa_import_progress_destroy(); zap_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } boolean_t spa_has_dedup(spa_t *spa) { return (spa->spa_dedup_class->mc_groups != 0); } /* * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_groups != 0); } boolean_t spa_has_special(spa_t *spa) { return (spa->spa_special_class->mc_groups != 0); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } spa_mode_t spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_get_last_scrubbed_txg(spa_t *spa) { return (spa->spa_scrubbed_last_txg); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_errorscrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; // error scrub stats spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; /* data not stored on disk */ ps->pss_pass_exam = spa->spa_scan_pass_exam; ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; /* error scrub data stored on disk */ ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; ps->pss_error_scrub_to_be_examined = scn->errorscrub_phys.dep_to_examine; /* error scrub data not stored on disk */ ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; return (0); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } /* * Returns the txg in which the last device removal completed. No indirect mappings * have been added since this txg. */ uint64_t spa_get_last_removal_txg(spa_t *spa) { uint64_t vdevid; uint64_t ret = -1ULL; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* * sr_prev_indirect_vdev is only modified while holding all the * config locks, so it is sufficient to hold SCL_VDEV as reader when * examining it. */ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; while (vdevid != -1ULL) { vdev_t *vd = vdev_lookup_top(spa, vdevid); vdev_indirect_births_t *vib = vd->vdev_indirect_births; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); /* * If the removal did not remap any data, we don't care. */ if (vdev_indirect_births_count(vib) != 0) { ret = vdev_indirect_births_last_entry_txg(vib); break; } vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_VDEV, FTAG); IMPLY(ret != -1ULL, spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); return (ret); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } boolean_t spa_multihost(spa_t *spa) { return (spa->spa_multihost ? B_TRUE : B_FALSE); } uint32_t spa_get_hostid(spa_t *spa) { return (spa->spa_hostid); } boolean_t spa_trust_config(spa_t *spa) { return (spa->spa_trust_config); } uint64_t spa_missing_tvds_allowed(spa_t *spa) { return (spa->spa_missing_tvds_allowed); } space_map_t * spa_syncing_log_sm(spa_t *spa) { return (spa->spa_syncing_log_sm); } void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { spa->spa_missing_tvds = missing; } /* * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc).
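* For example (illustrative, per the switch below): a healthy root vdev maps to "ONLINE", a root vdev that cannot open with VDEV_AUX_SPLIT_POOL maps to "SPLIT", and a suspended pool reports "SUSPENDED" regardless of vdev state.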
*/ const char * spa_state_to_name(spa_t *spa) { ASSERT3P(spa, !=, NULL); /* * It is possible for the spa to exist without a root vdev * while the spa transitions during import/export. */ vdev_t *rvd = spa->spa_root_vdev; if (rvd == NULL) { return ("TRANSITIONING"); } vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return ("OFFLINE"); case VDEV_STATE_REMOVED: return ("REMOVED"); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return ("FAULTED"); else if (aux == VDEV_AUX_SPLIT_POOL) return ("SPLIT"); else return ("UNAVAIL"); case VDEV_STATE_FAULTED: return ("FAULTED"); case VDEV_STATE_DEGRADED: return ("DEGRADED"); case VDEV_STATE_HEALTHY: return ("ONLINE"); default: break; } return ("UNKNOWN"); } boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t spa_has_checkpoint(spa_t *spa) { return (spa->spa_checkpoint_txg != 0); } boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && spa->spa_mode == SPA_MODE_READ); } uint64_t spa_min_claim_txg(spa_t *spa) { uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; if (checkpoint_txg != 0) return (checkpoint_txg + 1); return (spa->spa_first_txg); } /* * If there is a checkpoint, async destroys may consume more space from * the pool instead of freeing it. In an attempt to save the pool from * getting suspended when it is about to run out of space, we stop * processing async destroys. */ boolean_t spa_suspend_async_destroy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t unreserved = dsl_pool_unreserved_space(dp, ZFS_SPACE_CHECK_EXTRA_RESERVED); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; uint64_t avail = (unreserved > used) ?
(unreserved - used) : 0; if (spa_has_checkpoint(spa) && avail == 0) return (B_TRUE); return (B_FALSE); } #if defined(_KERNEL) int param_set_deadman_failmode_common(const char *val) { spa_t *spa = NULL; char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && strcmp(val, "panic") != 0) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); } return (0); } #endif /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); EXPORT_SYMBOL(spa_close); EXPORT_SYMBOL(spa_refcount_zero); /* Pool configuration lock */ EXPORT_SYMBOL(spa_config_tryenter); EXPORT_SYMBOL(spa_config_enter); EXPORT_SYMBOL(spa_config_exit); EXPORT_SYMBOL(spa_config_held); /* Pool vdev add/remove lock */ EXPORT_SYMBOL(spa_vdev_enter); EXPORT_SYMBOL(spa_vdev_exit); /* Pool vdev state change lock */ EXPORT_SYMBOL(spa_vdev_state_enter); EXPORT_SYMBOL(spa_vdev_state_exit); /* Accessor functions */ EXPORT_SYMBOL(spa_shutting_down); EXPORT_SYMBOL(spa_get_dsl); EXPORT_SYMBOL(spa_get_rootblkptr); EXPORT_SYMBOL(spa_set_rootblkptr); EXPORT_SYMBOL(spa_altroot); EXPORT_SYMBOL(spa_sync_pass); EXPORT_SYMBOL(spa_name); EXPORT_SYMBOL(spa_guid); EXPORT_SYMBOL(spa_last_synced_txg); EXPORT_SYMBOL(spa_first_txg); EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); EXPORT_SYMBOL(spa_special_class); EXPORT_SYMBOL(spa_preferred_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); EXPORT_SYMBOL(spa_lookup_by_guid); EXPORT_SYMBOL(spa_has_spare); EXPORT_SYMBOL(dva_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize); EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, "Set additional debugging flags"); ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, "Set to attempt to recover from fatal errors"); ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space");
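/* * Example (illustrative, Linux-specific): the ZFS_MODULE_PARAM entries in this file surface as module parameters, so e.g. 'echo 1 > /sys/module/zfs/parameters/zfs_recover' toggles zfs_recover at runtime. */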
ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, "Place DDT data into the special class"); ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW, "IO expiration time in milliseconds"); ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, "Small file blocks in special vdevs depend on this much " "free space being available"); ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, "Number of allocators per spa"); ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, "Minimum number of CPUs per allocator"); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 196289a5922b..c8d7280387a2 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6796 +1,6797 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class.
The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab. */ static uint_t zfs_embedded_slog_min_ms = 64; void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); if (vd->vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "<state %u>", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } char * vdev_rt_name(vdev_t *vd, const char *name) { return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}", spa_name(vd->vdev_spa), (u_longlong_t)vd->vdev_guid, name)); } static char * vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type) { return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}", spa_name(vd->vdev_spa), (u_longlong_t)vd->vdev_guid, name, dtl_type)); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group.
For embedded slogs, vdev_log_mg will be non-NULL and * will point to a metaslab group of either embedded_log_class (for normal * vdevs) or special_embedded_log_class (for special vdevs). */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if ((mc == spa_embedded_log_class(vd->vdev_spa) || mc == spa_special_embedded_log_class(vd->vdev_spa)) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg) { ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift)); uint64_t csize, psize = asize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg); psize = MIN(psize, csize); } return (psize); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. 
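* For example (illustrative): raidz and dRAID vdevs report their parity (1, 2, or 3) through the vdev_op_nparity callback, while vdev types without the callback report 0.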
*/ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } static int vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t objid; int err; if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (EINVAL); } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT0P(cvd->vdev_parent); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; pvd->vdev_nonrot &= cvd->vdev_nonrot; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT0P(cvd->vdev_top->vdev_parent->vdev_parent); /* * Walk up all ancestors to update guid sum. 
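* (Illustrative note: vdev_guid_sum is a vdev's own guid plus the guid sums of all of its children, so splicing in a child adds the child's entire subtree sum to every ancestor up to the root; vdev_remove_child() performs the matching subtraction.)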
*/ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_load_guid(); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments")); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. 
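* For example (illustrative, assuming the default zfs_checksum_events_per_second of 20): a disk producing thousands of checksum errors per second still posts at most about 20 checksum events per second to ZED.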
*/ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_dio_verify_rl, &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); /* * Default Thresholds for tuning ZED */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t)); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. 
*/ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { const char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) vd->vdev_devid = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &tmp) == 0) vd->vdev_enc_sysfs_path = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. 
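* (Illustrative reading, per common usage: 1 means the vdev was given a whole disk, 0 means a partition or file, and -1ULL means the label did not record the property.)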
*/ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. Ignore pool ashift for vdev * attach case. */ if (alloctype != VDEV_ALLOC_ATTACH) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); } else { vd->vdev_attaching = B_TRUE; } /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); if (vd->vdev_ops == &vdev_root_ops && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, &vd->vdev_root_zap); } /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); vd->vdev_rz_expanding = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. 
The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) vd->vdev_autosit = vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT0P(vd->vdev_initialize_thread); ASSERT0P(vd->vdev_trim_thread); ASSERT0P(vd->vdev_autotrim_thread); ASSERT0P(vd->vdev_rebuild_thread); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT0P(vd->vdev_child); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT0P(vd->vdev_parent); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. 
*/ vdev_queue_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); if (vd->vdev_prev_histo) kmem_free(vd->vdev_prev_histo, sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); zfs_range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } zfs_range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. 
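 *
 * For context (a sketch of one call path, not a new mechanism): when
 * vdev_remove_parent() collapses a 1-way mirror and the surviving child
 * takes over as the top-level vdev, this function migrates the metaslab
 * array, metaslab groups, per-txg dirty lists and space accounting from
 * svd to tvd, zeroing the corresponding fields of svd so that the
 * subsequent vdev_free(svd) tears down an empty shell.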
*/ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT0P(tvd->vdev_indirect_mapping); ASSERT0P(tvd->vdev_indirect_births); ASSERT0P(tvd->vdev_obsolete_sm); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; zfs_range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. 
There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; mvd->vdev_nonrot = cvd->vdev_nonrot; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT0(mvd->vdev_children); vdev_free(mvd); } /* * Choose GCD for spa_gcd_alloc. */ static uint64_t vdev_gcd(uint64_t a, uint64_t b) { while (b != 0) { uint64_t t = b; b = a % b; a = t; } return (a); } /* * Set spa_min_alloc and spa_gcd_alloc. 
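 *
 * A worked example with illustrative values: starting from the INT_MAX
 * sentinel, adding top-level vdevs whose min_alloc values are 12288 and
 * then 4096 gives
 *
 *	spa_gcd_alloc = 12288				(first vdev)
 *	spa_gcd_alloc = vdev_gcd(4096, 12288) = 4096	(second vdev)
 *
 * while spa_min_alloc tracks the smallest value seen (4096) and, with
 * the change below, spa_max_alloc the largest (12288).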
*/ static void vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd); if (!vd->vdev_islog) { if (mc == spa_special_class(spa)) { vd->vdev_log_mg = metaslab_group_create( spa_special_embedded_log_class(spa), vd); } else { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd); } } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } void vdev_update_nonallocating_space(vdev_t *vd, boolean_t add) { spa_t *spa = vd->vdev_spa; if (vd->vdev_mg->mg_class != spa_normal_class(spa)) return; uint64_t raw_space = metaslab_group_get_space(vd->vdev_mg); uint64_t dspace = spa_deflate(spa) ? vdev_deflated_space(vd, raw_space) : raw_space; if (add) { spa->spa_nonallocating_dspace += dspace; } else { ASSERT3U(spa->spa_nonallocating_dspace, >=, dspace); spa->spa_nonallocating_dspace -= dspace; } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; /* * Weighting algorithms can depend on the number of metaslabs in the * vdev. In order to ensure that all weights are correct at all times, * we need to recalculate here. 
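 *
 * For example (illustrative numbers): with ms_shift = 33 (8 GiB
 * metaslabs), expanding a vdev from 1 TiB to 2 TiB raises newc from
 * 2^40 >> 33 = 128 to 2^41 >> 33 = 256; the first loop below re-sorts
 * the 128 pre-existing metaslabs and the second initializes the new
 * metaslabs 128..255.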
*/ for (uint64_t m = 0; m < oldc; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); metaslab_recalculate_weight_and_sort(msp); mutex_exit(&msp->ms_lock); } for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. This works for normal and special vdevs. */ if ((vd->vdev_mg->mg_class == spa_normal_class(spa) || vd->vdev_mg->mg_class == spa_special_class(spa)) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ vdev_update_nonallocating_space(vd, B_TRUE); } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. 
*/ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); /* * If this probe was initiated from zio pipeline, then * change the state in a spa_async_request. Probes that * were initiated from a vdev_open can change the state * as part of the open call. * Skip fault injection if this vdev is already removed * or a removal is pending. */ if (vps->vps_zio_done_probe && !vd->vdev_remove_wanted && !vd->vdev_removed) { vd->vdev_fault_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); } } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. 
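 *
 * A sketch of the sharing (hypothetical concurrent callers, not code
 * from this file): two zios failing against the same vdev at roughly
 * the same time both call vdev_probe(vd, zio). The first allocates vps
 * and creates vd->vdev_probe_zio; the second finds vdev_probe_zio
 * already set and merely attaches to it via zio_add_child(). Both
 * parents then inherit the single probe's verdict when
 * vdev_probe_done() walks them and applies vdev_accessible().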
*/ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) taskq_wait(tq); for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE || cvd->vdev_state <= VDEV_STATE_FAULTED) continue; vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) taskq_destroy(tq); } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. 
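 *
 * A minimal usage sketch; the predicate below is hypothetical and not
 * part of this file:
 *
 *	static boolean_t
 *	vdev_open_skip_offline(vdev_t *cvd)
 *	{
 *		return (cvd->vdev_offline ? B_FALSE : B_TRUE);
 *	}
 *
 *	vdev_open_children_subset(vd, vdev_open_skip_offline);
 *
 * Children for which the predicate returns B_FALSE are neither opened
 * nor folded into the parent's vdev_nonrot summary.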
*/ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE * changed, this algorithm can not change, otherwise it would inconsistently * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; vd->vdev_remove_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. 
*/ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT0(vd->vdev_children); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT0(vd->vdev_children); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT0(vd->vdev_children); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT0(vd->vdev_children); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. 
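 *
 * For scale, assuming the standard label layout (two 256 KiB labels
 * plus a 3.5 MiB boot region at the front, two more labels at the
 * end): a leaf loses VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE =
 * 4 MiB + 512 KiB to labels, so a 100 GiB device yields an asize of
 * 100 GiB - 4.5 MiB, which must still cover vdev_min_asize below.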
*/ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time * (0) or the override value is impossible for the device, * then set it to the logical ashift and optimize the ashift. */ if (vd->vdev_ashift < vd->vdev_logical_ashift) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy, we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based on max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing.
*/ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. 
*/ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. 
*/ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { if (svd != NULL && *dvd != NULL) { if (strcmp(svd, *dvd) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " "from '%s' to '%s'", (u_longlong_t)guid, prefix, *dvd, svd); spa_strfree(*dvd); *dvd = spa_strdup(svd); } } else if (svd != NULL) { *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)guid, *dvd); } } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, dvd->vdev_guid); vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, dvd->vdev_guid); vdev_update_path("vdev_physpath", svd->vdev_physpath, &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. 
For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Recheck if resilver is still needed and cancel any * scheduled resilver if resilver is unneeded. */ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && spa->spa_async_tasks & SPA_ASYNC_RESILVER) { mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; mutex_exit(&spa->spa_async_lock); } /* * Reassess parent vdev's health. 
*/ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions is as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslab sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying the above constraints is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated, making it possible * to exceed the zfs_vdev_ms_count_limit.
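 *
 * Tracing the code below with illustrative numbers and the default
 * tunables: for a 1 TiB top-level vdev, asize >>
 * zfs_vdev_default_ms_shift (2^40 >> 29 = 2048) exceeds the default
 * count of 200, so ms_shift = highbit64(2^40 / 200) = 33, i.e. 8 GiB
 * metaslabs and 2^40 >> 33 = 128 of them -- consistent with the ~200
 * row of the table above.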
*/ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in more than 'maxfaults' children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles.
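 *
 * A worked example with illustrative txg ranges: consider a raidz2
 * top-level vdev (minref = nparity + 1 = 3 in vdev_dtl_reassess_impl()
 * below), three of whose children have DTL_MISSING ranges [10,20],
 * [15,25] and [18,30]. Only txgs 18-20 appear in three or more
 * children, so the parent's DTL_MISSING becomes [18,20]; every other
 * txg can still be reconstructed from parity.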
*/ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_contains(rt, txg, size)) zfs_range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_is_empty(rt)) dirty = zfs_range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = zfs_range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. 
*/ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]) * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ static void vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess_impl(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done, faulting); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan.
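 *
 * For instance (illustrative txgs): a device whose rebuild reached
 * VDEV_REBUILD_COMPLETE covering [vrp_min_txg, vrp_max_txg] = [100, 500],
 * with vdev_dtl_max() = 495, passes the vdev_dtl_should_excise() check
 * above (495 <= 500) and has the repaired range folded out of its DTL
 * below; had its DTL extended to txg 510, it would be treated as only
 * partially rebuilt and keep its DTL intact.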
*/ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's a little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!zfs_range_tree_is_empty( vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); /* * For the faulting case, treat members of a replacing vdev * as if they are not available. It's more likely than not that * a vdev in a replacing vdev could encounter read errors so * treat it as not being able to contribute. */ if (!vdev_readable(vd) || (faulting && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); } else { zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); } /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); } else { mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE : t; if (t == DTL_SCRUB) { /* leaf vdevs only */ continue; } if (t == DTL_PARTIAL) { /* i.e.
non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; } else { /* any kind of mirror */ minref = vd->vdev_children; } space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); } } void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done, rebuild_done, B_FALSE)); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt")); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } zfs_range_tree_vacate(rt, NULL, NULL); zfs_range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? 
VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync")); mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); zfs_range_tree_vacate(rtsync, NULL, NULL); zfs_range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. 
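 *
 * A hedged distillation of the rtsync idiom above, for any range
 * tree guarded by a mutex (the slow space map write runs with the
 * lock dropped):
 *
 *	mutex_enter(&lock);
 *	zfs_range_tree_walk(rt, zfs_range_tree_add, copy);
 *	mutex_exit(&lock);
 *	space_map_write(sm, copy, SM_ALLOC, SM_NO_VDEVID, tx);
 *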
*/ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be * - offlined * - detached * - removed * - faulted * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. 
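 *
 * A hedged sketch of the taskq fan-out below; vdev_load_child is
 * assumed to simply store vdev_load(cvd) into cvd->vdev_load_error,
 * matching how the results are collected afterwards:
 *
 *	for each child cvd:
 *		taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP);
 *	taskq_wait(tq);
 *	taskq_destroy(tq);
 *	for each child cvd: check cvd->vdev_load_error
 *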
*/ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); if (vd->vdev_ops == &vdev_raidz_ops) { error = vdev_raidz_load(vd); if (error != 0) return (error); } /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t failfast; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { vd->vdev_failfast = failfast & 1; } else if (error == ENOENT) { vd->vdev_failfast = vdev_prop_default_numeric( VDEV_PROP_FAILFAST); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t autosit; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), 1, &autosit); if (error == 0) { vd->vdev_autosit = autosit == 1; } else if (error == ENOENT) { vd->vdev_autosit = vdev_prop_default_numeric( VDEV_PROP_AUTOSIT); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } /* * Load any rebuild state from the top-level vdev zap. 
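 *
 * The per-property lookups above all follow one idiom; a hedged
 * sketch, with "val" and "dflt" as stand-in names:
 *
 *	error = zap_lookup(mos, zapobj, name, sizeof (val), 1, &val);
 *	if (error == 0)
 *		use the stored val;
 *	else if (error == ENOENT)
 *		use dflt;		-- property was never set
 *	else
 *		log or fail;		-- genuine lookup error
 *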
*/ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { uint64_t zapobj; if (vd->vdev_top_zap != 0) zapobj = vd->vdev_top_zap; else zapobj = vd->vdev_leaf_zap; error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &vd->vdev_checksum_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, &vd->vdev_checksum_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, &vd->vdev_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, &vd->vdev_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, &vd->vdev_slow_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, &vd->vdev_slow_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT0P(vd->vdev_checkpoint_sm); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. 
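 *
 * Note on the checkpoint accounting above: the checkpoint space map
 * contains free records only, so space_map_allocated() grows more
 * negative as frees accumulate, and negating it yields the
 * checkpointed space. E.g. (illustrative) 1 GiB of checkpointed
 * frees gives space_map_allocated() == -1 GiB and
 * vs_checkpoint_space == 1 GiB.
 *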
*/ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT0P(vd->vdev_obsolete_sm); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose is to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them.
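 *
 * The array length below is derived from the vdev geometry; e.g.
 * (illustrative numbers) a 1 TiB top-level vdev with
 * vdev_ms_shift == 34 (16 GiB metaslabs) has asize >> ms_shift == 64
 * entries, each holding a space map object number, or 0 for a
 * metaslab that was never synced.
 *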
*/ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg) { return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg)); } /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See * vdev_raidz_psize_to_asize(). 
*/ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. 
*/ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, or expanding which can trigger * repartition add/remove events, then don't do anything. */ if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); /* * Confirm the vdev has been removed, otherwise don't do anything. */ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. 
*/ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT0P(tvd->vdev_log_mg); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. 
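 *
 * A hedged distillation of the retry protocol used above for slog
 * offline (drop the state lock around the slow spa_reset_logs()
 * call, then detect concurrent config changes by generation number):
 *
 *	generation = spa->spa_config_generation + 1;
 *	(void) spa_vdev_state_exit(spa, vd, 0);	-- drop the lock
 *	error = slow work;
 *	spa_vdev_state_enter(spa, SCL_ALLOC);	-- retake the lock
 *	if (error || generation != spa->spa_config_generation)
 *		reactivate the group and goto top;
 *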
*/ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; atomic_store_64(&vd->vdev_outlier_count, 0); vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. 
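 *
 * The "state < VDEV_STATE_DEGRADED" comparison below leans on the
 * enum ordering in sys/fs/zfs.h (UNKNOWN < CLOSED < OFFLINE <
 * REMOVED < CANT_OPEN < FAULTED < DEGRADED < HEALTHY), so only
 * DEGRADED and HEALTHY vdevs count as alive here.
 *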
*/ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). 
*/ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? 
tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. 
*/ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT0((space & (SPA_MINBLOCKSIZE-1))); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. 
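 *
 * A hedged worked example of the deflation math: vdev_deflate_ratio
 * is set in vdev_set_deflate_ratio() to
 * (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT),
 * so a plain disk or mirror gets 512 (no deflation) while a 3-wide
 * raidz1 with ashift = 9 gets 131072 / 384 == 341; a 1 MiB
 * space_delta then deflates to (1 MiB >> 9) * 341 =~ 682 KiB of
 * dspace.
 *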
*/ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux)); } ASSERT(c < naux); /* * Setting the nvlist in the middle of the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty().
We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. 
we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. 
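 * These reach userland as sysevents (class 'resource.fs.zfs.statechange'),
 * which the ZFS Event Daemon (zed) matches against its statechange zedlets.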
* */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_spa->spa_raidz_expand == NULL || vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, txg)); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; VERIFY3U(pvd->vdev_children, >, 1); vdev_remove_child(pvd, vd); vdev_compact_children(pvd); ASSERT3P(pvd->vdev_child, !=, NULL); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %u active IOs", vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. 
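 * (spa_deadman_synctime() is derived from the zfs_deadman_synctime_ms
 * tunable, 600,000 ms by default, so this only fires for I/Os that have
 * been outstanding for minutes.)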
*/ fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ zfs_range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { zfs_range_seg64_t iter_rs = *logical_rs; zfs_range_seg64_t physical_rs; zfs_range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. 
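 *
 * A minimal sketch of a caller, assuming a hypothetical callback that
 * totals the physical bytes this leaf backs (count_phys and the variable
 * names are illustrative only, not part of this file):
 *
 *	static void
 *	count_phys(void *arg, zfs_range_seg64_t *physical_rs)
 *	{
 *		uint64_t *bytes = arg;
 *		*bytes += physical_rs->rs_end - physical_rs->rs_start;
 *	}
 *
 *	uint64_t bytes = 0;
 *	vdev_xlate_walk(leaf_vd, &logical_rs, count_phys, &bytes);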
*/ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. 
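 * Exactly one ZAP applies, selected below by vdev type: the root vdev's
 * ZAP, a top-level vdev's ZAP, or a leaf vdev's ZAP.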
*/ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, 
&intval) != 0) { error = EINVAL; break; } vd->vdev_failfast = intval & 1; break; case VDEV_PROP_SIT_OUT: /* Only expose this for a draid or raidz leaf */ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_top == NULL || (vd->vdev_top->vdev_ops != &vdev_raidz_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops)) { error = ENOTSUP; break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval == 1) { vdev_t *ancestor = vd; while (ancestor->vdev_parent != vd->vdev_top) ancestor = ancestor->vdev_parent; vdev_t *pvd = vd->vdev_top; uint_t sitouts = 0; for (int i = 0; i < pvd->vdev_children; i++) { if (pvd->vdev_child[i] == ancestor) continue; if (vdev_sit_out_reads( pvd->vdev_child[i], 0)) { sitouts++; } } if (sitouts >= vdev_get_nparity(pvd)) { error = ZFS_ERR_TOO_MANY_SITOUTS; break; } if (error == 0) vdev_raidz_sit_child(vd, INT64_MAX - gethrestime_sec()); } else { vdev_raidz_unsit_child(vd); } break; case VDEV_PROP_AUTOSIT: if (vd->vdev_ops != &vdev_raidz_ops && vd->vdev_ops != &vdev_draid_ops) { error = ENOTSUP; break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_autosit = intval == 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; case VDEV_PROP_SLOW_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_n = intval; break; case VDEV_PROP_SLOW_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_t = intval; break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, 
ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: 
vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_TRIM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_trim_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_SLOW_IOS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_slow_ios, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. 
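 * (Keeping the slot position stable means an older zpool binary still
 * decodes the stats array correctly.)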
*/ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; case VDEV_PROP_RAIDZ_EXPANDING: /* Only expose this for raidz */ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_rz_expanding, ZPROP_SRC_NONE); } continue; case VDEV_PROP_SIT_OUT: /* Only expose this for a draid or raidz leaf */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_top != NULL && (vd->vdev_top->vdev_ops == &vdev_raidz_ops || vd->vdev_top->vdev_ops == &vdev_draid_ops)) { vdev_prop_add_list(outnvl, propname, NULL, vdev_sit_out_reads(vd, 0), ZPROP_SRC_NONE); } continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_has_trim, ZPROP_SRC_NONE); } continue; /* Numeric Properties */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; case VDEV_PROP_AUTOSIT: /* Only raidz and draid vdevs have this property */ if (vd->vdev_ops != &vdev_raidz_ops && vd->vdev_ops != &vdev_draid_ops) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properties */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object.
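 * Only string-valued (1-byte integer) entries are returned; 8-byte
 * entries are internal numeric state and are skipped by the loop below.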
*/ zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za->za_name; switch (za->za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za->za_name, 1, za->za_num_integers, strval); if (err) { kmem_free(strval, za->za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); zap_attribute_free(za); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate limit Direct I/O write verify events to this many per second"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, "Direct I/O writes will perform checksum verification before " "committing the write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl, param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW, "RAIDZ implementation"); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index c44f654b0261..0d4fdaa77ba0 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1,2168 +1,2170 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of
this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * Virtual Device Labels * --------------------- * * The vdev label serves several distinct purposes: * * 1. Uniquely identify this device as part of a ZFS pool and confirm its * identity within the pool. * * 2. Verify that all the devices given in a configuration are present * within the pool. * * 3. Determine the uberblock for the pool. * * 4. In case of an import operation, determine the configuration of the * toplevel vdev of which it is a part. * * 5. If an import operation cannot find all the devices in the pool, * provide enough information to the administrator to determine which * devices are missing. * * It is important to note that while the kernel is responsible for writing the * label, it only consumes the information in the first three cases. The * latter information is only consumed in userland when determining the * configuration to import a pool. * * * Label Organization * ------------------ * * Before describing the contents of the label, it's important to understand how * the labels are written and updated with respect to the uberblock. * * When the pool configuration is altered, either because it was newly created * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ * | | | | | | * | t10 | | t10 | | t10 | * | | | | | | * +------+ +------+ +------+ * * In this stable state, the labels and the uberblock were all updated within * the same transaction group (10). Each label is mirrored and checksummed, so * that we can detect when we fail partway through writing the label. * * In order to identify which labels are valid, the labels are written in the * following manner: * * 1. For each vdev, update 'L1' to the new label * 2. Update the uberblock * 3. For each vdev, update 'L2' to the new label * * Given arbitrary failure, we can determine the correct label to use based on * the transaction group. If we fail after updating L1 but before updating the * UB, we will notice that L1's transaction group is greater than the uberblock, * so L2 must be valid. If we fail after writing the uberblock but before * writing L2, we will notice that L2's transaction group is less than L1, and * therefore L1 is valid. * * Another added complexity is that not every label is updated when the config * is synced. 
If we add a single device, we do not want to have to re-write * every label for every device in the pool. This means that both L1 and L2 may * be older than the pool uberblock, because the necessary information is stored * on another vdev. * * * On-disk Format * -------------- * * The vdev label consists of two distinct parts, and is wrapped within the * vdev_label_t structure. The label includes 8k of padding to permit legacy * VTOC disk labels, but is otherwise ignored. * * The first half of the label is a packed nvlist which contains pool wide * properties, per-vdev properties, and configuration information. It is * described in more detail below. * * The latter half of the label consists of a redundant array of uberblocks. * These uberblocks are updated whenever a transaction group is committed, * or when the configuration is updated. When a pool is loaded, we scan each * vdev for the 'best' uberblock. * * * Configuration Information * ------------------------- * * The nvlist describing the pool and vdev contains the following elements: * * version ZFS on-disk version * name Pool name * state Pool state * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. * features_for_read * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * * top_guid Unique ID for top-level vdev in which this is contained * guid Unique ID for the leaf vdev * * The 'vs' configuration follows the format described in 'spa_config.c'. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Basic routines to read and write from a vdev label. * Used throughout the rest of this file. */ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); ASSERT0(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t)); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } /* * Returns back the vdev label associated with the passed in offset. */ int vdev_label_number(uint64_t psize, uint64_t offset) { int l; if (offset >= psize - VDEV_LABEL_END_SIZE) { offset -= psize - VDEV_LABEL_END_SIZE; offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); } l = offset / sizeof (vdev_label_t); return (l < VDEV_LABELS ? 
l : -1); } static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } /* * Generate the nvlist representing this vdev's stats */ void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) { nvlist_t *nvx; vdev_stat_t *vs; vdev_stat_ex_t *vsx; vs = kmem_alloc(sizeof (*vs), KM_SLEEP); vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); vdev_get_stats_ex(vd, vs, vsx); fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); /* * Add extended stats into a special extended stats nvlist. This keeps * all the extended stats nicely grouped together. The extended stats * nvlist is then added to the main nvlist. */ nvx = fnvlist_alloc(); /* ZIOs in flight to disk */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_REBUILD]); /* ZIOs pending */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_REBUILD]); /* Histograms */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, 
ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD])); /* Request sizes */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, 
vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD])); /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); /* Direct I/O write verify errors */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS, vs->vs_dio_verify_errors); /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); fnvlist_free(nvx); kmem_free(vs, sizeof (*vs)); kmem_free(vsx, sizeof (*vsx)); } static void root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { spa_t *spa = vd->vdev_spa; if (vd != spa->spa_root_vdev) return; /* provide either current or previous scan information */ pool_scan_stat_t ps; if (spa_scan_get_stats(spa, &ps) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, sizeof (pool_scan_stat_t) / sizeof (uint64_t)); } pool_removal_stat_t prs; if (spa_removal_get_stats(spa, &prs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, sizeof (prs) / sizeof (uint64_t)); } pool_checkpoint_stat_t pcs; if (spa_checkpoint_get_stats(spa, &pcs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } pool_raidz_expand_stat_t pres; if (spa_raidz_expand_get_stats(spa, &pres) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, sizeof (pres) / sizeof (uint64_t)); } } static void top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { if (vd == vd->vdev_top) { vdev_rebuild_stat_t vrs; if (vdev_rebuild_get_stats(vd, &vrs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, sizeof (vrs) / sizeof (uint64_t)); } } } /* * Generate the nvlist representing this vdev's config. 
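 * This covers the vdev's identity (type, guid, paths), per-type details
 * added via vdev_op_config_generate(), top-level allocation metadata
 * (metaslab array/shift, ashift, asize), and optionally stats and the
 * configs of any children.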
*/ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); if (vd->vdev_devid != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); if (vd->vdev_physpath != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath); if (vd->vdev_enc_sysfs_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_ops->vdev_op_config_generate != NULL) vd->vdev_ops->vdev_op_config_generate(vd, nv); if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); if (flags & VDEV_CONFIG_L2CACHE) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array); fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, vd->vdev_ms_shift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, vd->vdev_noalloc); } /* * Slog devices are removed synchronously so don't * persist the vdev_removing flag to the label. 
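 * (A log removal completes within the removal operation itself, so there
 * is no partially-removed slog state for an import to recover.)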
*/ if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } /* zpool command expects alloc class data */ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { const char *bias = NULL; switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: bias = VDEV_ALLOC_BIAS_LOG; break; case VDEV_BIAS_SPECIAL: bias = VDEV_ALLOC_BIAS_SPECIAL; break; case VDEV_BIAS_DEDUP: bias = VDEV_ALLOC_BIAS_DEDUP; break; default: ASSERT3U(vd->vdev_alloc_bias, ==, VDEV_BIAS_NONE); } fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, bias); } } if (vd->vdev_dtl_sm != NULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, space_map_object(vd->vdev_dtl_sm)); } if (vic->vic_mapping_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, vic->vic_mapping_object); } if (vic->vic_births_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, vic->vic_births_object); } if (vic->vic_prev_indirect_vdev != UINT64_MAX) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, vic->vic_prev_indirect_vdev); } if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); if (vd->vdev_expansion_time) fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME, vd->vdev_expansion_time); if (flags & VDEV_CONFIG_MOS) { if (vd->vdev_leaf_zap != 0) { ASSERT(vd->vdev_ops->vdev_op_leaf); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, vd->vdev_leaf_zap); } if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, vd->vdev_top_zap); } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 && spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, vd->vdev_root_zap); } if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER); } } if (getstats) { vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context * (spa_get_stats()), so we need the rwlock to prevent * the mapping from being changed by condensing. */ rw_enter(&vd->vdev_indirect_rwlock, RW_READER); if (vd->vdev_indirect_mapping != NULL) { ASSERT(vd->vdev_indirect_births != NULL); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, vdev_indirect_mapping_size(vim)); } rw_exit(&vd->vdev_indirect_rwlock); if (vd->vdev_mg != NULL && vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { /* * Compute approximately how much memory would be used * for the indirect mapping if this device were to * be removed. * * Note: If the frag metric is invalid, then not * enough metaslabs have been converted to have * histograms. */ uint64_t seg_count = 0; uint64_t to_alloc = vd->vdev_stat.vs_alloc; /* * There are the same number of allocated segments * as free segments, so we will have at least one * entry per free segment. However, small free * segments (smaller than vdev_removal_max_span) * will be combined with adjacent allocated segments * as a single mapping. */ for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { if (i + 1 < highbit64(vdev_removal_max_span) - 1) { to_alloc += vd->vdev_mg->mg_histogram[i] << (i + 1); } else { seg_count += vd->vdev_mg->mg_histogram[i]; } } /* * The maximum length of a mapping is * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. 
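 * For example, with the default 16 MiB zfs_remove_max_segment, a device
 * with 1 TiB allocated contributes at least 1 TiB / 16 MiB = 65536
 * entries from this term alone.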
*/ seg_count += to_alloc / spa_remove_max_segment(spa); fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * sizeof (vdev_indirect_mapping_entry_phys_t)); } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); for (c = 0; c < vd->vdev_children; c++) { child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, vd->vdev_children); for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { const char *aux = NULL; if (vd->vdev_offline && !vd->vdev_tmpoffline) fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); if (vd->vdev_rebuild_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); if (vd->vdev_removed) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); if (vd->vdev_unspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); if (vd->vdev_ishole) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); /* Set the reason why we're FAULTED/DEGRADED. */ switch (vd->vdev_stat.vs_aux) { case VDEV_AUX_ERR_EXCEEDED: aux = "err_exceeded"; break; case VDEV_AUX_EXTERNAL: aux = "external"; break; } if (aux != NULL && !vd->vdev_tmpoffline) { fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); } else { /* * We're healthy - clear any previous AUX_STATE values. */ if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE)) nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE); } if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, vd->vdev_orig_guid); } } return (nv); } /* * Generate a view of the top-level vdevs. If we currently have holes * in the namespace, then generate an array which contains a list of holey * vdevs. Additionally, add the number of top-level children that currently * exist. */ void vdev_top_config_generate(spa_t *spa, nvlist_t *config) { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; uint_t c, idx; array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) { array[idx++] = c; } } if (idx) { VERIFY0(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, array, idx)); } VERIFY0(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, rvd->vdev_children)); kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } /* * Returns the configuration from the label of the given vdev. For vdevs * which don't have a txg value stored on their label (i.e. spares/cache) * or have not been completely initialized (txg = 0) just return * the configuration from the first valid label we find. Otherwise, * find the most up-to-date label that does not exceed the specified * 'txg' value. 
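 * In other words, prefer the valid label with the largest txg that is
 * still <= the caller's 'txg'; passing -1ULL (UINT64_MAX) accepts any
 * label.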
*/ nvlist_t * vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp[VDEV_LABELS]; abd_t *vp_abd[VDEV_LABELS]; zio_t *zio[VDEV_LABELS]; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(vd->vdev_validate_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); /* * The label for a dRAID distributed spare is not stored on disk. * Instead it is generated when needed which allows us to bypass * the pipeline when reading the config from the label. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return (vdev_draid_read_config_spare(vd)); for (int l = 0; l < VDEV_LABELS; l++) { vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp[l] = abd_to_buf(vp_abd[l]); } retry: for (int l = 0; l < VDEV_LABELS; l++) { zio[l] = zio_root(spa, NULL, NULL, flags); vdev_label_read(zio[l], vd, l, vp_abd[l], offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); } for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *label = NULL; if (zio_wait(zio[l]) == 0 && nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist), &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been * completely initialized so just return the * configuration from the first valid label we * encounter. */ error = nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg); if ((error || label_txg == 0) && !config) { config = label; for (l++; l < VDEV_LABELS; l++) zio_wait(zio[l]); break; } else if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; nvlist_free(config); config = fnvlist_dup(label); } } if (label != NULL) { nvlist_free(label); label = NULL; } } if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } /* * We found a valid label but it didn't pass txg restrictions. */ if (config == NULL && label_txg != 0) { vdev_dbgmsg(vd, "label discarded as txg is too large " "(%llu > %llu)", (u_longlong_t)label_txg, (u_longlong_t)txg); } for (int l = 0; l < VDEV_LABELS; l++) { abd_free(vp_abd[l]); } return (config); } /* * Determine if a device is in use. The 'spare_guid' parameter will be filled * in with the device guid if this spare is active elsewhere on the system. */ static boolean_t vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, uint64_t *spare_guid, uint64_t *l2cache_guid) { spa_t *spa = vd->vdev_spa; uint64_t state, pool_guid, device_guid, txg, spare_pool; uint64_t vdtxg = 0; nvlist_t *label; if (spare_guid) *spare_guid = 0ULL; if (l2cache_guid) *l2cache_guid = 0ULL; /* * Read the label, if any, and perform some basic sanity checks. */ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, &vdtxg); if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &device_guid) != 0) { nvlist_free(label); return (B_FALSE); } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &txg) != 0)) { nvlist_free(label); return (B_FALSE); } nvlist_free(label); /* * Check to see if this device indeed belongs to the pool it claims to * be a part of. 
 * The only way this is allowed is if the device is a hot
 * spare (which we check for later on).
 */
	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
	    !spa_guid_exists(pool_guid, device_guid) &&
	    !spa_spare_exists(device_guid, NULL, NULL) &&
	    !spa_l2cache_exists(device_guid, NULL))
		return (B_FALSE);

	/*
	 * If the transaction group is zero, then this is an initialized (but
	 * unused) label.  This is only an error if the create transaction
	 * on-disk is the same as the one we're using now, in which case the
	 * user has attempted to add the same vdev multiple times in the same
	 * transaction.
	 */
	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
	    txg == 0 && vdtxg == crtxg)
		return (B_TRUE);

	/*
	 * Check to see if this is a spare device.  We do an explicit check for
	 * spa_has_spare() here because it may be on our pending list of spares
	 * to add.
	 */
	if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
	    spa_has_spare(spa, device_guid)) {
		if (spare_guid)
			*spare_guid = device_guid;

		switch (reason) {
		case VDEV_LABEL_CREATE:
			return (B_TRUE);

		case VDEV_LABEL_REPLACE:
			return (!spa_has_spare(spa, device_guid) ||
			    spare_pool != 0ULL);

		case VDEV_LABEL_SPARE:
			return (spa_has_spare(spa, device_guid));
		default:
			break;
		}
	}

	/*
	 * Check to see if this is an l2cache device.
	 */
	if (spa_l2cache_exists(device_guid, NULL) ||
	    spa_has_l2cache(spa, device_guid)) {
		if (l2cache_guid)
			*l2cache_guid = device_guid;

		switch (reason) {
		case VDEV_LABEL_CREATE:
			return (B_TRUE);

		case VDEV_LABEL_REPLACE:
			return (!spa_has_l2cache(spa, device_guid));

		case VDEV_LABEL_L2CACHE:
			return (spa_has_l2cache(spa, device_guid));
		default:
			break;
		}
	}

	/*
	 * We can't rely on a pool's state if it's been imported
	 * read-only.  Instead we look to see if the pool is marked
	 * read-only in the namespace and set the state to active.
	 */
	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
	    (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
	    spa_mode(spa) == SPA_MODE_READ)
		state = POOL_STATE_ACTIVE;

	/*
	 * If the device is marked ACTIVE, then this device is in use by
	 * another pool on the system.
	 */
	return (state == POOL_STATE_ACTIVE);
}

static nvlist_t *
vdev_aux_label_generate(vdev_t *vd, boolean_t reason_spare)
{
	/*
	 * For inactive hot spares and level 2 ARC devices, we generate
	 * a special label that identifies it as a mutually shared hot
	 * spare or l2cache device.  We write the label in case of
	 * addition or removal of hot spare or l2cache vdev (in which
	 * case we want to revert the labels).
	 */
	nvlist_t *label = fnvlist_alloc();
	fnvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
	    spa_version(vd->vdev_spa));
	fnvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, reason_spare ?
	    POOL_STATE_SPARE : POOL_STATE_L2CACHE);
	fnvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid);

	/*
	 * This is merely to facilitate reporting the ashift of the
	 * cache device through zdb.  The actual retrieval of the
	 * ashift (in vdev_alloc()) uses the nvlist
	 * spa->spa_l2cache->sav_config (populated in
	 * spa_ld_open_aux_vdevs()).
	 */
	if (!reason_spare)
		fnvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT,
		    vd->vdev_ashift);

	/*
	 * Add path information to help find it during pool import.
	 */
	if (vd->vdev_path != NULL)
		fnvlist_add_string(label, ZPOOL_CONFIG_PATH, vd->vdev_path);
	if (vd->vdev_devid != NULL)
		fnvlist_add_string(label, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
	if (vd->vdev_physpath != NULL) {
		fnvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH,
		    vd->vdev_physpath);
	}
	return (label);
}

/*
 * Initialize a vdev label.  We check to make sure each leaf device is not in
 * use and is writable.  We put down an initial label which we will later
 * overwrite with a complete label.  Note that it's important to do this
 * sequentially, not in parallel, so that we catch cases of multiple use of
 * the same leaf vdev in the vdev we're creating -- e.g. mirroring a disk
 * with itself.
 */
int
vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	vdev_phys_t *vp;
	abd_t *vp_abd;
	abd_t *bootenv;
	uberblock_t *ub;
	abd_t *ub_abd;
	zio_t *zio;
	char *buf;
	size_t buflen;
	int error;
	uint64_t spare_guid = 0, l2cache_guid = 0;
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	boolean_t reason_spare = (reason == VDEV_LABEL_SPARE ||
	    (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare));
	boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE ||
	    (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache));

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((error = vdev_label_init(vd->vdev_child[c],
		    crtxg, reason)) != 0)
			return (error);

	/* Track the creation time for this vdev */
	vd->vdev_crtxg = crtxg;

	if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
		return (0);

	/*
	 * Dead vdevs cannot be initialized.
	 */
	if (vdev_is_dead(vd))
		return (SET_ERROR(EIO));

	/*
	 * Determine if the vdev is in use.
	 */
	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
	    vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
		return (SET_ERROR(EBUSY));

	/*
	 * If this is a request to add or replace a spare or l2cache device
	 * that is in use elsewhere on the system, then we must update the
	 * guid (which was initialized to a random value) to reflect the
	 * actual GUID (which is shared between multiple pools).
	 */
	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
	    spare_guid != 0ULL) {
		uint64_t guid_delta = spare_guid - vd->vdev_guid;

		vd->vdev_guid += guid_delta;

		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum += guid_delta;

		/*
		 * If this is a replacement, then we want to fall through to
		 * the rest of the code.  If we're adding a spare, then it's
		 * already labeled appropriately and we can just return.
		 */
		if (reason == VDEV_LABEL_SPARE)
			return (0);
		ASSERT(reason == VDEV_LABEL_REPLACE ||
		    reason == VDEV_LABEL_SPLIT);
	}

	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
	    l2cache_guid != 0ULL) {
		uint64_t guid_delta = l2cache_guid - vd->vdev_guid;

		vd->vdev_guid += guid_delta;

		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum += guid_delta;

		/*
		 * If this is a replacement, then we want to fall through to
		 * the rest of the code.  If we're adding an l2cache, then
		 * it's already labeled appropriately and we can just return.
		 */
		if (reason == VDEV_LABEL_L2CACHE)
			return (0);
		ASSERT(reason == VDEV_LABEL_REPLACE);
	}

	/*
	 * Initialize its label.
	 */
	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
	abd_zero(vp_abd, sizeof (vdev_phys_t));
	vp = abd_to_buf(vp_abd);

	/*
	 * Generate a label describing the pool and our top-level vdev.
	 * We mark it as being from txg 0 to indicate that it's not
	 * really part of an active pool just yet.  The labels will
	 * be written again with a meaningful txg by spa_sync().
	 */
	if (reason_spare || reason_l2cache) {
		label = vdev_aux_label_generate(vd, reason_spare);

		/*
		 * When a spare or l2cache (aux) vdev is added during pool
		 * creation, spa->spa_uberblock is not written until this
		 * point.  Write it on next config sync.
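		 * (uberblock_verify() returns non-zero for an uberblock
		 * that has never been written, which is how the check
		 * below detects this case.)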
		 */
		if (uberblock_verify(&spa->spa_uberblock))
			spa->spa_aux_sync_uber = B_TRUE;
	} else {
		uint64_t txg = 0ULL;

		if (reason == VDEV_LABEL_SPLIT)
			txg = spa->spa_uberblock.ub_txg;
		label = spa_config_generate(spa, vd, txg, B_FALSE);

		/*
		 * Add our creation time.  This allows us to detect multiple
		 * vdev uses as described above, and automatically expires if
		 * we fail.
		 */
		VERIFY0(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
		    crtxg));
	}

	buf = vp->vp_nvlist;
	buflen = sizeof (vp->vp_nvlist);

	error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
	if (error != 0) {
		nvlist_free(label);
		abd_free(vp_abd);
		/* EFAULT means nvlist_pack ran out of room */
		return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL));
	}

	/*
	 * Initialize uberblock template.
	 */
	ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
	abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
	abd_zero_off(ub_abd, sizeof (uberblock_t),
	    VDEV_UBERBLOCK_RING - sizeof (uberblock_t));
	ub = abd_to_buf(ub_abd);
	ub->ub_txg = 0;

	/* Initialize the 2nd padding area. */
	bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
	abd_zero(bootenv, VDEV_PAD_SIZE);

	/*
	 * Write everything in parallel.
	 */
retry:
	zio = zio_root(spa, NULL, NULL, flags);

	for (int l = 0; l < VDEV_LABELS; l++) {

		vdev_label_write(zio, vd, l, vp_abd,
		    offsetof(vdev_label_t, vl_vdev_phys),
		    sizeof (vdev_phys_t), NULL, NULL, flags);

		/*
		 * Skip the 1st padding area.
		 * Zero out the 2nd padding area where it might have
		 * leftover data from a previous filesystem format.
		 */
		vdev_label_write(zio, vd, l, bootenv,
		    offsetof(vdev_label_t, vl_be),
		    VDEV_PAD_SIZE, NULL, NULL, flags);

		vdev_label_write(zio, vd, l, ub_abd,
		    offsetof(vdev_label_t, vl_uberblock),
		    VDEV_UBERBLOCK_RING, NULL, NULL, flags);
	}

	error = zio_wait(zio);

	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
		flags |= ZIO_FLAG_TRYHARD;
		goto retry;
	}

	nvlist_free(label);
	abd_free(bootenv);
	abd_free(ub_abd);
	abd_free(vp_abd);

	/*
	 * If this vdev hasn't been previously identified as a spare, then we
	 * mark it as such only if a) we are labeling it as a spare, or b) it
	 * exists as a spare elsewhere in the system.  Do the same for
	 * level 2 ARC devices.
	 */
	if (error == 0 && !vd->vdev_isspare &&
	    (reason == VDEV_LABEL_SPARE ||
	    spa_spare_exists(vd->vdev_guid, NULL, NULL)))
		spa_spare_add(vd);

	if (error == 0 && !vd->vdev_isl2cache &&
	    (reason == VDEV_LABEL_L2CACHE ||
	    spa_l2cache_exists(vd->vdev_guid, NULL)))
		spa_l2cache_add(vd);

	return (error);
}

/*
 * Done callback for vdev_label_read_bootenv_impl.  If this is the first
 * callback to finish, store our abd in the callback pointer.  Otherwise, we
 * just free our abd and return.
 */
static void
vdev_label_read_bootenv_done(zio_t *zio)
{
	zio_t *rio = zio->io_private;
	abd_t **cbp = rio->io_private;

	ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);

	if (zio->io_error == 0) {
		mutex_enter(&rio->io_lock);
		if (*cbp == NULL) {
			/* Will free this buffer in vdev_label_read_bootenv. */
			*cbp = zio->io_abd;
		} else {
			abd_free(zio->io_abd);
		}
		mutex_exit(&rio->io_lock);
	} else {
		abd_free(zio->io_abd);
	}
}

static void
vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags);

	/*
	 * We just use the first label that has a correct checksum; the
	 * bootloader should have rewritten them all to be the same on boot,
	 * and any changes we made since boot have been the same across all
	 * labels.
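	 *
	 * The reads issued below share one parent zio whose io_private
	 * points at the caller's abd pointer; vdev_label_read_bootenv_done()
	 * keeps the first buffer that reads back cleanly and frees the rest,
	 * so "first" means whichever read completes successfully first.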
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
		for (int l = 0; l < VDEV_LABELS; l++) {
			vdev_label_read(zio, vd, l,
			    abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
			    offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
			    vdev_label_read_bootenv_done, zio, flags);
		}
	}
}

int
vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
{
	nvlist_t *config;
	spa_t *spa = rvd->vdev_spa;
	abd_t *abd = NULL;
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;

	ASSERT(bootenv);
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	zio_t *zio = zio_root(spa, NULL, &abd, flags);
	vdev_label_read_bootenv_impl(zio, rvd, flags);
	int err = zio_wait(zio);

	if (abd != NULL) {
		char *buf;
		vdev_boot_envblock_t *vbe = abd_to_buf(abd);

		vbe->vbe_version = ntohll(vbe->vbe_version);
		switch (vbe->vbe_version) {
		case VB_RAW:
			/*
			 * If we have textual data in vbe_bootenv, create an
			 * nvlist with the key "envmap".
			 */
			fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
			vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
			fnvlist_add_string(bootenv, GRUB_ENVMAP,
			    vbe->vbe_bootenv);
			break;

		case VB_NVLIST:
			err = nvlist_unpack(vbe->vbe_bootenv,
			    sizeof (vbe->vbe_bootenv), &config, 0);
			if (err == 0) {
				fnvlist_merge(bootenv, config);
				nvlist_free(config);
				break;
			}
			zfs_fallthrough;
		default:
			/* Check for FreeBSD zfs bootonce command string */
			buf = abd_to_buf(abd);
			if (*buf == '\0') {
				fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
				    VB_NVLIST);
				break;
			}
			fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
		}

		/*
		 * abd was allocated in vdev_label_read_bootenv_impl().
		 */
		abd_free(abd);
		/*
		 * If we managed to read any successfully,
		 * return success.
		 */
		return (0);
	}
	return (err);
}

int
vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
{
	zio_t *zio;
	spa_t *spa = vd->vdev_spa;
	vdev_boot_envblock_t *bootenv;
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	int error;
	size_t nvsize;
	char *nvbuf;
	const char *tmp;

	error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
	if (error != 0)
		return (SET_ERROR(error));

	if (nvsize >= sizeof (bootenv->vbe_bootenv)) {
		return (SET_ERROR(E2BIG));
	}

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	error = ENXIO;
	for (int c = 0; c < vd->vdev_children; c++) {
		int child_err;

		child_err = vdev_label_write_bootenv(vd->vdev_child[c], env);
		/*
		 * As long as any of the disks managed to write all of their
		 * labels successfully, return success.
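		 * (error was primed to ENXIO above, so a single successful
		 * child is enough to clear it.)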
*/ if (child_err == 0) error = child_err; } if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || !vdev_writeable(vd)) { return (error); } ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(abd, VDEV_PAD_SIZE); bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); nvbuf = bootenv->vbe_bootenv; nvsize = sizeof (bootenv->vbe_bootenv); bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); switch (bootenv->vbe_version) { case VB_RAW: if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) { (void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize); } error = 0; break; case VB_NVLIST: error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR, KM_SLEEP); break; default: error = EINVAL; break; } if (error == 0) { bootenv->vbe_version = htonll(bootenv->vbe_version); abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); } else { abd_free(abd); return (SET_ERROR(error)); } retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, abd, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); } error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } abd_free(abd); return (error); } /* * ========================================================================== * uberblock load/sync * ========================================================================== */ /* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. When we * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); /* * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware * ZFS, e.g. OpenZFS >= 0.7. * * If one ub has MMP and the other does not, they were written by * different hosts, which matters for MMP. So we treat no MMP/no SEQ as * a 0 value. * * Since timestamp and txg are the same if we get this far, either is * acceptable for importing the pool. 
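	 *
	 * For example, if ub1 predates MMP (seq1 stays 0) while ub2 carries
	 * MMP_SEQ(ub2) == 5, then TREE_CMP(0, 5) returns -1 and ub2 is
	 * taken as the more recent uberblock, even though txg and timestamp
	 * match.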
*/ unsigned int seq1 = 0; unsigned int seq2 = 0; if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) seq1 = MMP_SEQ(ub1); if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) seq2 = MMP_SEQ(ub2); return (TREE_CMP(seq1, seq2)); } struct ubl_cbdata { uberblock_t ubl_latest; /* Most recent uberblock */ uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; static void vdev_uberblock_load_done(zio_t *zio) { vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { cbp->ubl_latest = *ub; } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* * Keep track of the vdev in which this uberblock * was found. We will use this information later * to obtain the config nvlist associated with * this uberblock. */ *cbp->ubl_ubbest = *ub; cbp->ubl_vd = vd; } mutex_exit(&rio->io_lock); } abd_free(zio->io_abd); } static void vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, struct ubl_cbdata *cbp) { for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } } } } /* * Reads the 'best' uberblock from disk along with its associated * configuration. First, we read the uberblock array of each label of each * vdev, keeping track of the uberblock with the highest txg in each array. * Then, we read the configuration from the same vdev as the best uberblock. */ void vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) { zio_t *zio; spa_t *spa = rvd->vdev_spa; struct ubl_cbdata cb; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ASSERT(ub); ASSERT(config); memset(ub, 0, sizeof (uberblock_t)); memset(&cb, 0, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); vdev_uberblock_load_impl(zio, rvd, flags, &cb); (void) zio_wait(zio); /* * It's possible that the best uberblock was discovered on a label * that has a configuration which was written in a future txg. * Search all labels on this vdev to find the configuration that * matches the txg for our uberblock. */ if (cb.ubl_vd != NULL) { vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. 
" "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); if (ub->ub_raidz_reflow_info != cb.ubl_latest.ub_raidz_reflow_info) { vdev_dbgmsg(cb.ubl_vd, "spa=%s best uberblock (txg=%llu info=0x%llx) " "has different raidz_reflow_info than latest " "uberblock (txg=%llu info=0x%llx)", spa->spa_name, (u_longlong_t)ub->ub_txg, (u_longlong_t)ub->ub_raidz_reflow_info, (u_longlong_t)cb.ubl_latest.ub_txg, (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); memset(ub, 0, sizeof (uberblock_t)); spa_config_exit(spa, SCL_ALL, FTAG); return; } *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " "Trying again without txg restrictions."); *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); } if (*config == NULL) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); } } spa_config_exit(spa, SCL_ALL, FTAG); } /* * For use when a leaf vdev is expanded. * The location of labels 2 and 3 changed, and at the new location the * uberblock rings are either empty or contain garbage. The sync will write * new configs there because the vdev is dirty, but expansion also needs the * uberblock rings copied. Read them from label 0 which did not move. * * Since the point is to populate labels {2,3} with valid uberblocks, * we zero uberblocks we fail to read or which are not valid. */ static void vdev_copy_uberblocks(vdev_t *vd) { abd_t *ub_abd; zio_t *write_zio; int locks = (SCL_L2ARC | SCL_ZIO); int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* * No uberblocks are stored on distributed spares, they may be * safely skipped when expanding a leaf vdev. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags); for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { const int src_label = 0; zio_t *zio; zio = zio_root(vd->vdev_spa, NULL, NULL, flags); vdev_label_read(zio, vd, src_label, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd))) abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); for (int l = 2; l < VDEV_LABELS; l++) vdev_label_write(write_zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags | ZIO_FLAG_DONT_PROPAGATE); } (void) zio_wait(write_zio); spa_config_exit(vd->vdev_spa, locks, FTAG); abd_free(ub_abd); } /* * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). */ static void vdev_uberblock_sync_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) atomic_inc_64(good_writes); } /* * Write the uberblock to all labels of all leaves of the specified vdev. */ static void vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, uberblock_t *ub, vdev_t *vd, int flags) { for (uint64_t c = 0; c < vd->vdev_children; c++) { vdev_uberblock_sync(zio, good_writes, ub, vd->vdev_child[c], flags); } if (!vd->vdev_ops->vdev_op_leaf) return; if (!vdev_writeable(vd)) return; /* * There's no need to write uberblocks to a distributed spare, they * are already stored on all the leaves of the parent dRAID. 
	 * For this same reason vdev_uberblock_load_impl() skips distributed
	 * spares when reading uberblocks.
	 */
	if (vd->vdev_ops == &vdev_draid_spare_ops)
		return;

	/* If the vdev was expanded, we need to copy the uberblock rings. */
	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
	    vd->vdev_copy_uberblocks == B_TRUE) {
		vdev_copy_uberblocks(vd);
		vd->vdev_copy_uberblocks = B_FALSE;
	}

	/*
	 * We choose a slot based on the txg.  If this uberblock has a special
	 * RAIDZ expansion state, then it is essentially an update of the
	 * current uberblock (it has the same txg).  However, the current
	 * state is committed, so we want to write it to a different slot.  If
	 * we overwrote the same slot, and we lose power during the uberblock
	 * write, and the disk does not do single-sector overwrites
	 * atomically (even though it is required to - i.e. we should see
	 * either the old or the new uberblock), then we could lose this
	 * txg's uberblock.  Rewinding to the previous txg's uberblock may not
	 * be possible because RAIDZ expansion may have already overwritten
	 * some of the data, so we need the progress indicator in the
	 * uberblock.
	 */
	int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
	int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) %
	    (VDEV_UBERBLOCK_COUNT(vd) - m);

	/* Copy the uberblock_t into the ABD */
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
	abd_zero_off(ub_abd, sizeof (uberblock_t),
	    VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));

	for (int l = 0; l < VDEV_LABELS; l++)
		vdev_label_write(zio, vd, l, ub_abd,
		    VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
		    vdev_uberblock_sync_done, good_writes,
		    flags | ZIO_FLAG_DONT_PROPAGATE);

	abd_free(ub_abd);
}

/* Sync the uberblocks to all vdevs in svd[] */
int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub,
    int flags)
{
	spa_t *spa = svd[0]->vdev_spa;
	zio_t *zio;
	uint64_t good_writes = 0;

	zio = zio_root(spa, NULL, NULL, flags);

	for (int v = 0; v < svdcount; v++)
		vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);

	if (spa->spa_aux_sync_uber) {
		for (int v = 0; v < spa->spa_spares.sav_count; v++) {
			vdev_uberblock_sync(zio, &good_writes, ub,
			    spa->spa_spares.sav_vdevs[v], flags);
		}
		for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
			vdev_uberblock_sync(zio, &good_writes, ub,
			    spa->spa_l2cache.sav_vdevs[v], flags);
		}
	}
	(void) zio_wait(zio);

	/*
	 * Flush the uberblocks to disk.  This ensures that the odd labels
	 * are no longer needed (because the new uberblocks and the even
	 * labels are safely on disk), so it is safe to overwrite them.
	 */
	zio = zio_root(spa, NULL, NULL, flags);

	for (int v = 0; v < svdcount; v++) {
		if (vdev_writeable(svd[v])) {
			zio_flush(zio, svd[v]);
		}
	}
	if (spa->spa_aux_sync_uber) {
		spa->spa_aux_sync_uber = B_FALSE;
		for (int v = 0; v < spa->spa_spares.sav_count; v++) {
			if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) {
				zio_flush(zio, spa->spa_spares.sav_vdevs[v]);
			}
		}
		for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
			if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) {
				zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]);
			}
		}
	}

	(void) zio_wait(zio);

	return (good_writes >= 1 ? 0 : EIO);
}

/*
 * On success, increment the count of good writes for our top-level vdev.
 */
static void
vdev_label_sync_done(zio_t *zio)
{
	uint64_t *good_writes = zio->io_private;

	if (zio->io_error == 0)
		atomic_inc_64(good_writes);
}

/*
 * If there weren't enough good writes, indicate failure to the parent.
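 * The good_writes counter is allocated by vdev_label_sync_list() and
 * handed to this callback through io_private; it is freed here once the
 * count has been inspected.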
 */
static void
vdev_label_sync_top_done(zio_t *zio)
{
	uint64_t *good_writes = zio->io_private;

	if (*good_writes == 0)
		zio->io_error = SET_ERROR(EIO);

	kmem_free(good_writes, sizeof (uint64_t));
}

/*
 * We ignore errors for log and cache devices; simply free the private data.
 */
static void
vdev_label_sync_ignore_done(zio_t *zio)
{
	kmem_free(zio->io_private, sizeof (uint64_t));
}

/*
 * Write all even or odd labels to all leaves of the specified vdev.
 */
static void
vdev_label_sync(zio_t *zio, uint64_t *good_writes, vdev_t *vd, int l,
    uint64_t txg, int flags)
{
	nvlist_t *label;
	vdev_phys_t *vp;
	abd_t *vp_abd;
	char *buf;
	size_t buflen;
	vdev_t *pvd = vd->vdev_parent;
	boolean_t spare_in_use = B_FALSE;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_label_sync(zio, good_writes, vd->vdev_child[c],
		    l, txg, flags);
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return;

	if (!vdev_writeable(vd))
		return;

	/*
	 * The top-level config never needs to be written to a distributed
	 * spare.  When read, vdev_draid_read_config_spare() will generate
	 * the config for vdev_label_read_config().
	 */
	if (vd->vdev_ops == &vdev_draid_spare_ops)
		return;

	if (pvd && pvd->vdev_ops == &vdev_spare_ops)
		spare_in_use = B_TRUE;

	/*
	 * Generate a label describing the top-level config to which we belong.
	 */
	if ((vd->vdev_isspare && !spare_in_use) || vd->vdev_isl2cache) {
		label = vdev_aux_label_generate(vd, vd->vdev_isspare);
	} else {
		label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
	}

	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
	abd_zero(vp_abd, sizeof (vdev_phys_t));
	vp = abd_to_buf(vp_abd);

	buf = vp->vp_nvlist;
	buflen = sizeof (vp->vp_nvlist);

	if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) {
		for (; l < VDEV_LABELS; l += 2) {
			vdev_label_write(zio, vd, l, vp_abd,
			    offsetof(vdev_label_t, vl_vdev_phys),
			    sizeof (vdev_phys_t),
			    vdev_label_sync_done, good_writes,
			    flags | ZIO_FLAG_DONT_PROPAGATE);
		}
	}

	abd_free(vp_abd);
	nvlist_free(label);
}

static int
vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
{
	list_t *dl = &spa->spa_config_dirty_list;
	vdev_t *vd;
	zio_t *zio;
	int error;

	/*
	 * Write the new labels to disk.
	 */
	zio = zio_root(spa, NULL, NULL, flags);

	for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
		uint64_t *good_writes;

		ASSERT(!vd->vdev_ishole);

		good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		zio_t *vio = zio_null(zio, spa, NULL,
		    (vd->vdev_islog || vd->vdev_aux != NULL) ?
		    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
		    good_writes, flags);
		vdev_label_sync(vio, good_writes, vd, l, txg, flags);
		zio_nowait(vio);
	}

	/*
	 * The AUX path may have changed during import.
	 */
	spa_aux_vdev_t *sav[2] = {&spa->spa_spares, &spa->spa_l2cache};
	for (int i = 0; i < 2; i++) {
		for (int v = 0; v < sav[i]->sav_count; v++) {
			uint64_t *good_writes;
			if (!sav[i]->sav_label_sync)
				continue;
			good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
			zio_t *vio = zio_null(zio, spa, NULL,
			    vdev_label_sync_ignore_done, good_writes, flags);
			vdev_label_sync(vio, good_writes, sav[i]->sav_vdevs[v],
			    l, txg, flags);
			zio_nowait(vio);
		}
	}

	error = zio_wait(zio);

	/*
	 * Flush the new labels to disk.
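	 * sav_label_sync stays set until the odd-label pass (l == 1) has
	 * completed, so if the even-label pass fails and is retried, the
	 * aux vdev labels are written again as well.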
*/ zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) zio_flush(zio, vd); for (int i = 0; i < 2; i++) { if (!sav[i]->sav_label_sync) continue; for (int v = 0; v < sav[i]->sav_count; v++) zio_flush(zio, sav[i]->sav_vdevs[v]); if (l == 1) sav[i]->sav_label_sync = B_FALSE; } (void) zio_wait(zio); return (error); } /* * Sync the uberblock and any changes to the vdev configuration. * * The order of operations is carefully crafted to ensure that * if the system panics or loses power at any time, the state on disk * is still transactionally consistent. The in-line comments below * describe the failure semantics at each stage. * * Moreover, vdev_config_sync() is designed to be idempotent: if it fails * at any time, you can just call it again, and it will resume its work. */ int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(svdcount != 0); retry: /* * Normally, we don't want to try too hard to write every label and * uberblock. If there is a flaky disk, we don't want the rest of the * sync process to block while we retry. But if we can't write a * single label out, we should retry with ZIO_FLAG_TRYHARD before * bailing out and declaring the pool faulted. */ if (error != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) return (error); flags |= ZIO_FLAG_TRYHARD; } ASSERT(ub->ub_txg <= txg); /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. */ if (ub->ub_txg < txg) { boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); if (!changed && list_is_empty(&spa->spa_config_dirty_list) && !spa_multihost(spa)) return (0); } if (txg > spa_freeze_txg(spa)) return (0); ASSERT(txg <= spa->spa_final_txg); /* * Flush the write cache of every disk that's been written to * in this transaction group. This ensures that all blocks * written in this txg will be committed to stable storage * before any uberblock that references them. */ zio_t *zio = zio_root(spa, NULL, NULL, flags); for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); (void) zio_wait(zio); /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the * even labels that made it to disk will be newer than any uberblock, * and will therefore be considered invalid. The odd labels (L1, L3), * which have not yet been touched, will still be valid. We flush * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the even labels " "of dirty vdevs", error, spa_name(spa)); } goto retry; } /* * Sync the uberblocks to all vdevs in svd[]. 
* If the system dies in the middle of this step, there are two cases * to consider, and the on-disk state is consistent either way: * * (1) If none of the new uberblocks made it to disk, then the * previous uberblock will be the newest, and the odd labels * (which had not yet been touched) will be valid with respect * to that uberblock. * * (2) If one or more new uberblocks made it to disk, then they * will be the newest, and the even labels (which had all * been successfully committed) will be valid with respect * to the new uberblocks. */ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_uberblock_sync_list() returned error " "%d for pool '%s'", error, spa_name(spa)); } goto retry; } if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); /* * Sync out odd labels for every dirty vdev. If the system dies * in the middle of this process, the even labels and the new * uberblocks will suffice to open the pool. The next time * the pool is opened, the first thing we'll do -- before any * user data is modified -- is mark every vdev dirty so that * all labels will be brought up to date. We flush the new labels * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the odd labels of " "dirty vdevs", error, spa_name(spa)); } goto retry; } return (0); }
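
/*
 * To recap the ordering that vdev_config_sync() enforces above, a compact
 * sketch of one transaction group (time flows downward):
 *
 *	flush data vdevs	    all blocks of this txg on stable storage
 *	sync + flush even labels    L0/L2 now describe the new txg
 *	sync + flush uberblocks	    the new txg becomes the on-disk state
 *	sync + flush odd labels	    L1/L3 catch up
 *
 * A crash between any two steps leaves either the previous uberblock with
 * valid odd labels, or the new uberblocks with valid even labels, which is
 * exactly the invariant the comments above rely on.
 */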