Index: vendor-sys/openzfs/dist/cmd/zfs/zfs_main.c =================================================================== --- vendor-sys/openzfs/dist/cmd/zfs/zfs_main.c (revision 365339) +++ vendor-sys/openzfs/dist/cmd/zfs/zfs_main.c (revision 365340) @@ -1,8620 +1,8637 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K * Copyright 2019 Joyent, Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #endif /* HAVE_IDMAP */ #include "zfs_iter.h" #include "zfs_util.h" #include "zfs_comutil.h" #include "libzfs_impl.h" #include "zfs_projectutil.h" libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); static int zfs_do_destroy(int argc, char **argv); static int zfs_do_get(int argc, char **argv); static int zfs_do_inherit(int argc, char **argv); static int zfs_do_list(int argc, char **argv); static int zfs_do_mount(int argc, char **argv); static int zfs_do_rename(int argc, char **argv); static int zfs_do_rollback(int argc, char **argv); static int zfs_do_set(int argc, char **argv); static int zfs_do_upgrade(int argc, char **argv); static int zfs_do_snapshot(int argc, char **argv); static int zfs_do_unmount(int argc, char **argv); static int zfs_do_share(int argc, char **argv); static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); static int zfs_do_allow(int argc, char **argv); static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); static int zfs_do_bookmark(int argc, char **argv); static int zfs_do_channel_program(int argc, char **argv); static int zfs_do_load_key(int argc, char **argv); static int zfs_do_unload_key(int argc, char **argv); static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); static int zfs_do_version(int argc, char **argv); static int zfs_do_redact(int argc, char **argv); static int zfs_do_wait(int argc, char **argv); #ifdef __FreeBSD__ static int zfs_do_jail(int argc, char **argv); static int zfs_do_unjail(int argc, char **argv); #endif /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_CLONE, HELP_CREATE, HELP_DESTROY, HELP_GET, HELP_INHERIT, HELP_UPGRADE, HELP_LIST, HELP_MOUNT, HELP_PROMOTE, HELP_RECEIVE, HELP_RENAME, HELP_ROLLBACK, HELP_SEND, HELP_SET, HELP_SHARE, HELP_SNAPSHOT, HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, HELP_UNALLOW, HELP_USERSPACE, HELP_GROUPSPACE, HELP_PROJECTSPACE, HELP_PROJECT, HELP_HOLD, HELP_HOLDS, HELP_RELEASE, HELP_DIFF, HELP_BOOKMARK, HELP_CHANNEL_PROGRAM, HELP_LOAD_KEY, HELP_UNLOAD_KEY, HELP_CHANGE_KEY, HELP_VERSION, HELP_REDACT, HELP_JAIL, HELP_UNJAIL, HELP_WAIT, } zfs_help_t; typedef struct zfs_command { const char *name; int (*func)(int argc, char **argv); zfs_help_t usage; } zfs_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zfs_command_t command_table[] = { { "version", zfs_do_version, HELP_VERSION }, { NULL }, { "create", zfs_do_create, HELP_CREATE }, { "destroy", zfs_do_destroy, HELP_DESTROY }, { NULL }, { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, { "rollback", zfs_do_rollback, HELP_ROLLBACK }, { "clone", zfs_do_clone, HELP_CLONE }, { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, { NULL }, { "userspace", zfs_do_userspace, HELP_USERSPACE }, { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE }, { NULL }, { "project", zfs_do_project, HELP_PROJECT }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, { "share", zfs_do_share, HELP_SHARE }, { "unshare", zfs_do_unshare, HELP_UNSHARE }, { NULL }, { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, { "redact", zfs_do_redact, HELP_REDACT }, { "wait", zfs_do_wait, HELP_WAIT }, #ifdef __FreeBSD__ { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, #endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) zfs_command_t *current_command; static const char * get_usage(zfs_help_t idx) { switch (idx) { case HELP_CLONE: return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: return (gettext("\tcreate [-Pnpv] [-o property=value] ... " "\n" "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: return (gettext("\tdestroy [-fnpRrv] \n" "\tdestroy [-dnpRrv] " "@[%][,...]\n" "\tdestroy #\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " "[-o \"all\" | field[,...]]\n" "\t [-t type[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot|bookmark] ...\n")); case HELP_INHERIT: return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " "[-s property]...\n\t [-S property]... [-t type[,...]] " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: return (gettext("\treceive [-vMnsFhu] " "[-o =] ... [-x ] ...\n" "\t \n" "\treceive [-vMnsFhu] [-o =] ... " "[-x ] ... \n" "\t [-d | -e] \n" "\treceive -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" - "\trename [-f] -p \n" + "\trename -p [-f] \n" + "\trename -u [-f] \n" "\trename -r \n")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] " "\n" "\tsend [-nvPLecw] [-i snapshot|bookmark] " "\n" "\tsend [-DnPpvLec] [-i bookmark|snapshot] " "--redact \n" "\tsend [-nvPe] -t \n" "\tsend [-Pnv] --saved filesystem\n")); case HELP_SET: return (gettext("\tset ... " " ...\n")); case HELP_SHARE: return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot [-r] [-o property=value] ... " "@ ...\n")); case HELP_UNMOUNT: return (gettext("\tunmount [-fu] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " "<-a [nfs|smb] | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow \n" "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " "\n" "\tallow -c [,...] \n" "\tallow -s @setname [,...] " "\n")); case HELP_UNALLOW: return (gettext("\tunallow [-rldug] " "<\"everyone\"|user|group>[,...]\n" "\t [[,...]] \n" "\tunallow [-rld] -e [[,...]] " "\n" "\tunallow [-r] -c [[,...]] " "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); case HELP_USERSPACE: return (gettext("\tuserspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_GROUPSPACE: return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_PROJECTSPACE: return (gettext("\tprojectspace [-Hp] [-o field[,...]] " "[-s field] ... \n" "\t [-S field] ... \n")); case HELP_PROJECT: return (gettext("\tproject [-d|-r] \n" "\tproject -c [-0] [-d|-r] [-p id] \n" "\tproject -C [-k] [-r] \n" "\tproject [-p id] [-r] [-s] \n")); case HELP_HOLD: return (gettext("\thold [-r] ...\n")); case HELP_HOLDS: return (gettext("\tholds [-rH] ...\n")); case HELP_RELEASE: return (gettext("\trelease [-r] ...\n")); case HELP_DIFF: return (gettext("\tdiff [-FHt] " "[snapshot|filesystem]\n")); case HELP_BOOKMARK: return (gettext("\tbookmark " "\n")); case HELP_CHANNEL_PROGRAM: return (gettext("\tprogram [-jn] [-t ] " "[-m ]\n" "\t [lua args...]\n")); case HELP_LOAD_KEY: return (gettext("\tload-key [-rn] [-L ] " "<-a | filesystem|volume>\n")); case HELP_UNLOAD_KEY: return (gettext("\tunload-key [-r] " "<-a | filesystem|volume>\n")); case HELP_CHANGE_KEY: return (gettext("\tchange-key [-l] [-o keyformat=]\n" - "\t [-o keylocation=] [-o pbkfd2iters=]\n" + "\t [-o keylocation=] [-o pbkdf2iters=]\n" "\t \n" "\tchange-key -i [-l] \n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_REDACT: return (gettext("\tredact " " ...\n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: return (gettext("\tunjail \n")); case HELP_WAIT: return (gettext("\twait [-t ] \n")); } abort(); /* NOTREACHED */ } void nomem(void) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); exit(1); } /* * Utility function to guarantee malloc() success. */ void * safe_malloc(size_t size) { void *data; if ((data = calloc(1, size)) == NULL) nomem(); return (data); } static void * safe_realloc(void *data, size_t size) { void *newp; if ((newp = realloc(data, size)) == NULL) { free(data); nomem(); } return (newp); } static char * safe_strdup(char *str) { char *dupstr = strdup(str); if (dupstr == NULL) nomem(); return (dupstr); } /* * Callback routine that will print out information for each of * the properties. */ static int usage_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, "YES "); if (zfs_prop_inheritable(prop)) (void) fprintf(fp, " YES "); else (void) fprintf(fp, " NO "); if (zfs_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zfs_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static void usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { (void) fprintf(fp, gettext("usage: zfs command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nEach dataset is of the form: " "pool/[dataset/]*dataset[@name]\n")); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && (strcmp(current_command->name, "set") == 0 || strcmp(current_command->name, "get") == 0 || strcmp(current_command->name, "inherit") == 0 || strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; if (show_properties) { (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); (void) fprintf(fp, "\t%-15s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "written@"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); (void) fprintf(fp, gettext("\nUser-defined properties can " "be specified by using a name containing a colon (:).\n")); (void) fprintf(fp, gettext("\nThe {user|group|project}" "[obj]{used|quota}@ properties must be appended with\n" "a user|group|project specifier of one of these forms:\n" " POSIX name (eg: \"matt\")\n" " POSIX id (eg: \"126829\")\n" " SMB name@domain (eg: \"matt@sun\")\n" " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"), "zfs set|get"); (void) fprintf(fp, gettext("\nFor the delegated permission list, run: %s\n"), "zfs allow|unallow"); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * Take a property=value argument string and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parseprop(nvlist_t *props, char *propname) { char *propval; if ((propval = strchr(propname, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for property=value argument\n")); return (B_FALSE); } *propval = '\0'; propval++; if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_string(props, propname, propval) != 0) nomem(); return (B_TRUE); } /* * Take a property name argument and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parsepropname(nvlist_t *props, char *propname) { if (strchr(propname, '=') != NULL) { (void) fprintf(stderr, gettext("invalid character " "'=' in property argument\n")); return (B_FALSE); } if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_boolean(props, propname) != 0) nomem(); return (B_TRUE); } static int parse_depth(char *opt, int *flags) { char *tmp; int depth; depth = (int)strtol(opt, &tmp, 0); if (*tmp) { (void) fprintf(stderr, gettext("%s is not an integer\n"), optarg); usage(B_FALSE); } if (depth < 0) { (void) fprintf(stderr, gettext("Depth can not be negative.\n")); usage(B_FALSE); } *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); return (depth); } #define PROGRESS_DELAY 2 /* seconds */ static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; static time_t pt_begin; static char *pt_header = NULL; static boolean_t pt_shown; static void start_progress_timer(void) { pt_begin = time(NULL) + PROGRESS_DELAY; pt_shown = B_FALSE; } static void set_progress_header(char *header) { assert(pt_header == NULL); pt_header = safe_strdup(header); if (pt_shown) { (void) printf("%s: ", header); (void) fflush(stdout); } } static void update_progress(char *update) { if (!pt_shown && time(NULL) > pt_begin) { int len = strlen(update); (void) printf("%s: %s%*.*s", pt_header, update, len, len, pt_reverse); (void) fflush(stdout); pt_shown = B_TRUE; } else if (pt_shown) { int len = strlen(update); (void) printf("%s%*.*s", update, len, len, pt_reverse); (void) fflush(stdout); } } static void finish_progress(char *done) { if (pt_shown) { (void) printf("%s\n", done); (void) fflush(stdout); } free(pt_header); pt_header = NULL; } static int zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) { zfs_handle_t *zhp = NULL; int ret = 0; zhp = zfs_open(hdl, dataset, type); if (zhp == NULL) return (1); /* * Volumes may neither be mounted or shared. Potentially in the * future filesystems detected on these volumes could be mounted. */ if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) { zfs_close(zhp); return (0); } /* * Mount and/or share the new filesystem as appropriate. We provide a * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. * * If the user doesn't want the dataset automatically mounted, then * skip the mount/share step */ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { if (zfs_mount_delegation_check()) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but it may only be " "mounted by root\n")); ret = 1; } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); ret = 1; } else if (zfs_share(zhp) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); ret = 1; } zfs_commit_all_shares(); } zfs_close(zhp); return (ret); } /* * zfs clone [-p] [-o prop=value] ... * * Given an existing dataset, create a writable copy whose initial contents * are the same as the source. The newly created dataset maintains a * dependency on the original; the original cannot be destroyed so long as * the clone exists. * * The '-p' flag creates all the non-existing ancestors of the target first. */ static int zfs_do_clone(int argc, char **argv) { zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; int ret = 0; int c; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "o:p")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; case 'p': parents = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); goto usage; } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); goto usage; } /* open the source dataset */ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) { nvlist_free(props); return (1); } if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { /* * Now create the ancestors of the target dataset. If the * target already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_close(zhp); nvlist_free(props); return (0); } if (zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); nvlist_free(props); return (1); } } /* pass to libzfs */ ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ if (ret == 0) { if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); } zfs_close(zhp); nvlist_free(props); return (!!ret); usage: ASSERT3P(zhp, ==, NULL); nvlist_free(props); usage(B_FALSE); return (-1); } /* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems * and volumes. Snapshot creation is handled by 'zfs snapshot'. * For volumes, the user must specify a size to be used. * * The '-s' flag applies only to volumes, and indicates that we should not try * to set the reservation for this volume. By default we set a reservation * equal to the size for any volume. For pools with SPA_VERSION >= * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. * * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity * check of arguments and properties, but does not check for permissions, * available space, etc. * * The '-v' flag is for verbose output. * * The '-P' flag is used for parseable output. It implies '-v'. */ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; zpool_handle_t *zpool_handle = NULL; nvlist_t *real_props = NULL; uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t verbose = B_FALSE; boolean_t parseable = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":PV:b:nso:pv")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) nomem(); volsize = intval; break; case 'P': verbose = B_TRUE; parseable = B_TRUE; break; case 'p': parents = B_TRUE; break; case 'b': bflag = B_TRUE; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "block size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), intval) != 0) nomem(); break; case 'n': dryrun = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) goto error; break; case 's': noreserve = B_TRUE; break; case 'v': verbose = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { (void) fprintf(stderr, gettext("'-s' and '-b' can only be " "used when creating a volume\n")); goto badusage; } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing %s argument\n"), zfs_type_to_name(type)); goto badusage; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); goto badusage; } if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) *p = '/'; if (zpool_handle == NULL) goto error; (void) snprintf(msg, sizeof (msg), dryrun ? gettext("cannot verify '%s'") : gettext("cannot create '%s'"), argv[0]); if (props && (real_props = zfs_valid_proplist(g_zfs, type, props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { zpool_close(zpool_handle); goto error; } } /* * if volsize is not a multiple of volblocksize, round it up to the * nearest multiple of the volblocksize */ if (type == ZFS_TYPE_VOLUME) { uint64_t volblocksize; if (nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { nvlist_free(props); nomem(); } } } if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; char *strval; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); if (spa_version >= SPA_VERSION_REFRESERVATION) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; volsize = zvol_volsize_to_reservation(zpool_handle, volsize, real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { if (nvlist_add_uint64(props, zfs_prop_to_name(resv_prop), volsize) != 0) { nvlist_free(props); nomem(); } } } if (zpool_handle != NULL) { zpool_close(zpool_handle); nvlist_free(real_props); } if (parents && zfs_name_valid(argv[0], type)) { /* * Now create the ancestors of target dataset. If the target * already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[0], type)) { ret = 0; goto error; } if (verbose) { (void) printf(parseable ? "create_ancestors\t%s\n" : dryrun ? "would create ancestors of %s\n" : "create ancestors of %s\n", argv[0]); } if (!dryrun) { if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { goto error; } } } if (verbose) { nvpair_t *nvp = NULL; (void) printf(parseable ? "create\t%s\n" : dryrun ? "would create %s\n" : "create %s\n", argv[0]); while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { uint64_t uval; char *sval; switch (nvpair_type(nvp)) { case DATA_TYPE_UINT64: VERIFY0(nvpair_value_uint64(nvp, &uval)); (void) printf(parseable ? "property\t%s\t%llu\n" : "\t%s=%llu\n", nvpair_name(nvp), (u_longlong_t)uval); break; case DATA_TYPE_STRING: VERIFY0(nvpair_value_string(nvp, &sval)); (void) printf(parseable ? "property\t%s\t%s\n" : "\t%s=%s\n", nvpair_name(nvp), sval); break; default: (void) fprintf(stderr, "property '%s' " "has illegal type %d\n", nvpair_name(nvp), nvpair_type(nvp)); abort(); } } } if (dryrun) { ret = 0; goto error; } /* pass to libzfs */ if (zfs_create(g_zfs, argv[0], type, props) != 0) goto error; if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); return (ret); badusage: nvlist_free(props); usage(B_FALSE); return (2); } /* * zfs destroy [-rRf] * zfs destroy [-rRd] * * -r Recursively destroy all children * -R Recursively destroy all dependents, including clones * -f Force unmounting of any dependents * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can * either be a child, or a clone of a child. */ typedef struct destroy_cbdata { boolean_t cb_first; boolean_t cb_force; boolean_t cb_recurse; boolean_t cb_error; boolean_t cb_doclones; zfs_handle_t *cb_target; boolean_t cb_defer_destroy; boolean_t cb_verbose; boolean_t cb_parsable; boolean_t cb_dryrun; nvlist_t *cb_nvl; nvlist_t *cb_batchedsnaps; /* first snap in contiguous run */ char *cb_firstsnap; /* previous snap in contiguous run */ char *cb_prevsnap; int64_t cb_snapused; char *cb_snapspec; char *cb_bookmark; uint64_t cb_snap_count; } destroy_cbdata_t; /* * Check for any dependents based on the '-r' or '-R' flags. */ static int destroy_check_dependent(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cbp = data; const char *tname = zfs_get_name(cbp->cb_target); const char *name = zfs_get_name(zhp); if (strncmp(tname, name, strlen(tname)) == 0 && (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { /* * This is a direct descendant, not a clone somewhere else in * the hierarchy. */ if (cbp->cb_recurse) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has children\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } else { /* * This is a clone. We only want to report this if the '-r' * wasn't specified, or the target is a snapshot. */ if (!cbp->cb_recurse && zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has dependent clones\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } out: zfs_close(zhp); return (0); } static int destroy_batched(destroy_cbdata_t *cb) { int error = zfs_destroy_snaps_nvl(g_zfs, cb->cb_batchedsnaps, B_FALSE); fnvlist_free(cb->cb_batchedsnaps); cb->cb_batchedsnaps = fnvlist_alloc(); return (error); } static int destroy_callback(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cb = data; const char *name = zfs_get_name(zhp); int error; if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } /* * Ignore pools (which we've already flagged as an error before getting * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { zfs_close(zhp); return (0); } if (cb->cb_dryrun) { zfs_close(zhp); return (0); } /* * We batch up all contiguous snapshots (even of different * filesystems) and destroy them with one ioctl. We can't * simply do all snap deletions and then all fs deletions, * because we must delete a clone before its origin. */ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { cb->cb_snap_count++; fnvlist_add_boolean(cb->cb_batchedsnaps, name); if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) error = destroy_batched(cb); } else { error = destroy_batched(cb); if (error != 0 || zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { zfs_close(zhp); /* * When performing a recursive destroy we ignore errors * so that the recursive destroy could continue * destroying past problem datasets */ if (cb->cb_recurse) { cb->cb_error = B_TRUE; return (0); } return (-1); } } zfs_close(zhp); return (0); } static int destroy_print_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; const char *name = zfs_get_name(zhp); int err = 0; if (nvlist_exists(cb->cb_nvl, name)) { if (cb->cb_firstsnap == NULL) cb->cb_firstsnap = strdup(name); if (cb->cb_prevsnap != NULL) free(cb->cb_prevsnap); /* this snap continues the current range */ cb->cb_prevsnap = strdup(name); if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) nomem(); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } } else if (cb->cb_firstsnap != NULL) { /* end of this range */ uint64_t used = 0; err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } zfs_close(zhp); return (err); } static int destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) { int err; assert(cb->cb_firstsnap == NULL); assert(cb->cb_prevsnap == NULL); err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0); if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); } cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } return (err); } static int snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; /* Check for clones. */ if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; err = zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, cb); } if (err == 0) { if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) nomem(); } zfs_close(zhp); return (err); } static int gather_snapshots(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); if (err == ENOENT) err = 0; if (err != 0) goto out; if (cb->cb_verbose) { err = destroy_print_snapshots(zhp, cb); if (err != 0) goto out; } if (cb->cb_recurse) err = zfs_iter_filesystems(zhp, gather_snapshots, cb); out: zfs_close(zhp); return (err); } static int destroy_clones(destroy_cbdata_t *cb) { nvpair_t *pair; for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); pair != NULL; pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { boolean_t defer = cb->cb_defer_destroy; int err; /* * We can't defer destroy non-snapshots, so set it to * false while destroying the clones. */ cb->cb_defer_destroy = B_FALSE; err = zfs_iter_dependents(zhp, B_FALSE, destroy_callback, cb); cb->cb_defer_destroy = defer; zfs_close(zhp); if (err != 0) return (err); } } return (0); } static int zfs_do_destroy(int argc, char **argv) { destroy_cbdata_t cb = { 0 }; int rv = 0; int err = 0; int c; zfs_handle_t *zhp = NULL; char *at, *pound; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; break; case 'p': cb.cb_verbose = B_TRUE; cb.cb_parsable = B_TRUE; break; case 'n': cb.cb_dryrun = B_TRUE; break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': cb.cb_force = B_TRUE; break; case 'r': cb.cb_recurse = B_TRUE; break; case 'R': cb.cb_recurse = B_TRUE; cb.cb_doclones = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } at = strchr(argv[0], '@'); pound = strchr(argv[0], '#'); if (at != NULL) { /* Build the list of snaps to destroy in cb_nvl. */ cb.cb_nvl = fnvlist_alloc(); *at = '\0'; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(cb.cb_nvl); return (1); } cb.cb_snapspec = at + 1; if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || cb.cb_error) { rv = 1; goto out; } if (nvlist_empty(cb.cb_nvl)) { (void) fprintf(stderr, gettext("could not find any " "snapshots to destroy; check snapshot names.\n")); rv = 1; goto out; } if (cb.cb_verbose) { char buf[16]; zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf)); if (cb.cb_parsable) { (void) printf("reclaim\t%llu\n", (u_longlong_t)cb.cb_snapused); } else if (cb.cb_dryrun) { (void) printf(gettext("would reclaim %s\n"), buf); } else { (void) printf(gettext("will reclaim %s\n"), buf); } } if (!cb.cb_dryrun) { if (cb.cb_doclones) { cb.cb_batchedsnaps = fnvlist_alloc(); err = destroy_clones(&cb); if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, B_FALSE); } if (err != 0) { rv = 1; goto out; } } if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, cb.cb_defer_destroy); } } if (err != 0) rv = 1; } else if (pound != NULL) { int err; nvlist_t *nvl; if (cb.cb_dryrun) { (void) fprintf(stderr, "dryrun is not supported with bookmark\n"); return (-1); } if (cb.cb_defer_destroy) { (void) fprintf(stderr, "defer destroy is not supported with bookmark\n"); return (-1); } if (cb.cb_recurse) { (void) fprintf(stderr, "recursive is not supported with bookmark\n"); return (-1); } /* * Unfortunately, zfs_bookmark() doesn't honor the * casesensitivity setting. However, we can't simply * remove this check, because lzc_destroy_bookmarks() * ignores non-existent bookmarks, so this is necessary * to get a proper error message. */ if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); return (1); } nvl = fnvlist_alloc(); fnvlist_add_boolean(nvl, argv[0]); err = lzc_destroy_bookmarks(nvl, NULL); if (err != 0) { (void) zfs_standard_error(g_zfs, err, "cannot destroy bookmark"); } nvlist_free(nvl); return (err); } else { /* Open the given dataset */ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; /* * Perform an explicit check for pools before going any further. */ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "operation does not apply to pools\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zfs destroy -r " "%s' to destroy all datasets in the pool\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zpool destroy %s' " "to destroy the pool itself\n"), zfs_get_name(zhp)); rv = 1; goto out; } /* * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; if (!cb.cb_doclones && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { rv = 1; goto out; } if (cb.cb_error) { rv = 1; goto out; } cb.cb_batchedsnaps = fnvlist_alloc(); if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { rv = 1; goto out; } /* * Do the real thing. The callback will close the * handle regardless of whether it succeeds or not. */ err = destroy_callback(zhp, &cb); zhp = NULL; if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, cb.cb_defer_destroy); } if (err != 0 || cb.cb_error == B_TRUE) rv = 1; } out: fnvlist_free(cb.cb_batchedsnaps); fnvlist_free(cb.cb_nvl); if (zhp != NULL) zfs_close(zhp); return (rv); } static boolean_t is_recvd_column(zprop_get_cbdata_t *cbp) { int i; zfs_get_column_t col; for (i = 0; i < ZFS_GET_NCOLS && (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) if (col == GET_COL_RECVD) return (B_TRUE); return (B_FALSE); } /* * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] * < all | property[,property]... > < fs | snap | vol > ... * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. * -o Set of fields to display. One of "name,property,value, * received,source". Default is "name,property,value,source". * "all" is an alias for all five. * -s Set of sources to allow. One of * "local,default,inherited,received,temporary,none". Default is * all six. * -p Display values in parsable (literal) format. * * Prints properties for the given datasets. The user can control which * columns to display as well as which property types to allow. */ /* * Invoked to display the properties for a single dataset. */ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAX_DATASET_NAME_LEN]; zprop_get_cbdata_t *cbp = data; nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; char *strval; char *sourceval; boolean_t received = is_recvd_column(cbp); for (; pl != NULL; pl = pl->pl_next) { char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. */ if (pl->pl_prop == ZFS_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, buf, sizeof (buf), &sourcetype, source, sizeof (source), cbp->cb_literal) != 0) { if (pl->pl_all) continue; if (!zfs_prop_valid_for_type(pl->pl_prop, ZFS_TYPE_DATASET, B_FALSE)) { (void) fprintf(stderr, gettext("No such property '%s'\n"), zfs_prop_to_name(pl->pl_prop)); continue; } sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } if (received && (zfs_prop_get_recvd(zhp, zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), buf, sourcetype, source, recvdval); } else if (zfs_prop_userquota(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else if (zfs_prop_written(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_written(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; sourcetype = ZPROP_SRC_NONE; strval = "-"; } else { verify(nvlist_lookup_string(propval, ZPROP_VALUE, &strval) == 0); verify(nvlist_lookup_string(propval, ZPROP_SOURCE, &sourceval) == 0); if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; } else if (strcmp(sourceval, ZPROP_SOURCE_VAL_RECVD) == 0) { sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, sourceval, sizeof (source)); } } if (received && (zfs_prop_get_recvd(zhp, pl->pl_user_prop, rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, source, recvdval); } } return (0); } static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; char *value, *fields; int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'o': /* * Process the set of columns to display. We zero out * the structure to give us a blank slate. */ bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "received", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_RECVD; flags |= ZFS_ITER_RECVD_PROPS; break; case 4: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 5: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_RECVD; cb.cb_columns[4] = GET_COL_SOURCE; flags |= ZFS_ITER_RECVD_PROPS; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case 's': cb.cb_sources = 0; while (*optarg != '\0') { static char *source_subopts[] = { "local", "default", "inherited", "received", "temporary", "none", NULL }; switch (getsubopt(&optarg, source_subopts, &value)) { case 0: cb.cb_sources |= ZPROP_SRC_LOCAL; break; case 1: cb.cb_sources |= ZPROP_SRC_DEFAULT; break; case 2: cb.cb_sources |= ZPROP_SRC_INHERITED; break; case 3: cb.cb_sources |= ZPROP_SRC_RECEIVED; break; case 4: cb.cb_sources |= ZPROP_SRC_TEMPORARY; break; case 5: cb.cb_sources |= ZPROP_SRC_NONE; break; default: (void) fprintf(stderr, gettext("invalid source " "'%s'\n"), value); usage(B_FALSE); } } break; case 't': types = 0; flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all", NULL }; switch (getsubopt(&optarg, type_subopts, &value)) { case 0: types |= ZFS_TYPE_FILESYSTEM; break; case 1: types |= ZFS_TYPE_VOLUME; break; case 2: case 3: types |= ZFS_TYPE_SNAPSHOT; break; case 4: types |= ZFS_TYPE_BOOKMARK; break; case 5: types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; break; default: (void) fprintf(stderr, gettext("invalid type '%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } fields = argv[0]; /* * Handle users who want to get all snapshots or bookmarks * of a dataset (ex. 'zfs get -t snapshot refer '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); argc--; argv++; /* * As part of zfs_expand_proplist(), we keep track of the maximum column * width for each property. For the 'NAME' (and 'SOURCE') columns, we * need to know the maximum name length. However, the user likely did * not specify 'name' as one of the properties to fetch, so we need to * make sure we always include at least this property for * print_get_headers() to work properly. */ if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZFS_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } cb.cb_first = B_TRUE; /* run for each object */ ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } /* * inherit [-rS] ... * * -r Recurse over all children * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to * use the default value. The '-r' flag will recurse over all children, and is * useful for setting a property on a hierarchy-wide basis, regardless of any * local modifications for each dataset. */ typedef struct inherit_cbdata { const char *cb_propname; boolean_t cb_received; } inherit_cbdata_t; static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that * are not valid for this type of dataset. */ if (prop != ZPROP_INVAL && !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE)) return (0); return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; int ret = 0; int flags = 0; boolean_t received = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'S': received = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing property argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } propname = argv[0]; argc--; argv++; if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { if (zfs_prop_readonly(prop)) { (void) fprintf(stderr, gettext( "%s property is read-only\n"), propname); return (1); } if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); (void) fprintf(stderr, gettext("use 'zfs " "inherit -S %s' to revert to received " "value\n"), propname); } return (1); } if (received && (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION)) { (void) fprintf(stderr, gettext("'%s' property cannot " "be reverted to a received value\n"), propname); return (1); } } else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } cb.cb_propname = propname; cb.cb_received = received; if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_cb, &cb); } return (ret); } typedef struct upgrade_cbdata { uint64_t cb_numupgraded; uint64_t cb_numsamegraded; uint64_t cb_numfailed; uint64_t cb_version; boolean_t cb_newer; boolean_t cb_foundone; char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; } upgrade_cbdata_t; static int same_pool(zfs_handle_t *zhp, const char *name) { int len1 = strcspn(name, "/@"); const char *zhname = zfs_get_name(zhp); int len2 = strcspn(zhname, "/@"); if (len1 != len2) return (B_FALSE); return (strncmp(name, zhname, len1) == 0); } static int upgrade_list_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); /* list if it's old/new */ if ((!cb->cb_newer && version < ZPL_VERSION) || (cb->cb_newer && version > ZPL_VERSION)) { char *str; if (cb->cb_newer) { str = gettext("The following filesystems are " "formatted using a newer software version and\n" "cannot be accessed on the current system.\n\n"); } else { str = gettext("The following filesystems are " "out of date, and can be upgraded. After being\n" "upgraded, these filesystems (and any 'zfs send' " "streams generated from\n" "subsequent snapshots) will no longer be " "accessible by older software versions.\n\n"); } if (!cb->cb_foundone) { (void) puts(str); (void) printf(gettext("VER FILESYSTEM\n")); (void) printf(gettext("--- ------------\n")); cb->cb_foundone = B_TRUE; } (void) printf("%2u %s\n", version, zfs_get_name(zhp)); } return (0); } static int upgrade_set_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int needed_spa_version; int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); needed_spa_version = zfs_spa_version_map(cb->cb_version); if (needed_spa_version < 0) return (-1); if (spa_version < needed_spa_version) { /* can't upgrade */ (void) printf(gettext("%s: can not be " "upgraded; the pool version needs to first " "be upgraded\nto version %d\n\n"), zfs_get_name(zhp), needed_spa_version); cb->cb_numfailed++; return (0); } /* upgrade */ if (version < cb->cb_version) { char verstr[16]; (void) snprintf(verstr, sizeof (verstr), "%llu", (u_longlong_t)cb->cb_version); if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; else cb->cb_numfailed++; (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp)); } else if (version > cb->cb_version) { /* can't downgrade */ (void) printf(gettext("%s: can not be downgraded; " "it is already at version %u\n"), zfs_get_name(zhp), version); cb->cb_numfailed++; } else { cb->cb_numsamegraded++; } return (0); } /* * zfs upgrade * zfs upgrade -v * zfs upgrade [-r] [-V ] <-a | filesystem> */ static int zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; int ret = 0; upgrade_cbdata_t cb = { 0 }; int c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "rvV:a")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'v': showversions = B_TRUE; break; case 'V': if (zfs_prop_string_to_index(ZFS_PROP_VERSION, optarg, &cb.cb_version) != 0) { (void) fprintf(stderr, gettext("invalid version %s\n"), optarg); usage(B_FALSE); } break; case 'a': all = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) usage(B_FALSE); if (showversions && (flags & ZFS_ITER_RECURSE || all || cb.cb_version || argc)) usage(B_FALSE); if ((all || argc) && (showversions)) usage(B_FALSE); if (all && argc) usage(B_FALSE); if (showversions) { /* Show info on available versions. */ (void) printf(gettext("The following filesystem versions are " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and filesystem " "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf("see the ZFS Administration Guide.\n\n"); ret = 0; } else if (argc || all) { /* Upgrade filesystems */ if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), (u_longlong_t)cb.cb_numupgraded); if (cb.cb_numsamegraded) { (void) printf(gettext("%llu filesystems already at " "this version\n"), (u_longlong_t)cb.cb_numsamegraded); } if (cb.cb_numfailed != 0) ret = 1; } else { /* List old-version filesystems */ boolean_t found; (void) printf(gettext("This system is currently running " "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " "formatted with the current version.\n")); } } return (ret); } /* * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] filesystem | snapshot * * -H Scripted mode; elide headers and separate columns by tabs. * -i Translate SID to POSIX ID. * -n Print numeric ID instead of user/group name. * -o Control which fields to display. * -p Use exact (parsable) numeric output. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * Displays space consumed by, and quotas on, each user in the specified * filesystem or snapshot. */ /* us_field_types, us_field_hdr and us_field_names should be kept in sync */ enum us_field_types { USFIELD_TYPE, USFIELD_NAME, USFIELD_USED, USFIELD_QUOTA, USFIELD_OBJUSED, USFIELD_OBJQUOTA }; static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA", "OBJUSED", "OBJQUOTA" }; static char *us_field_names[] = { "type", "name", "used", "quota", "objused", "objquota" }; #define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) #define USTYPE_PSX_GRP (1 << 0) #define USTYPE_PSX_USR (1 << 1) #define USTYPE_SMB_GRP (1 << 2) #define USTYPE_SMB_USR (1 << 3) #define USTYPE_PROJ (1 << 4) #define USTYPE_ALL \ (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \ USTYPE_PROJ) static int us_type_bits[] = { USTYPE_PSX_GRP, USTYPE_PSX_USR, USTYPE_SMB_GRP, USTYPE_SMB_USR, USTYPE_ALL }; static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", "smbuser", "all" }; typedef struct us_node { nvlist_t *usn_nvl; uu_avl_node_t usn_avlnode; uu_list_node_t usn_listnode; } us_node_t; typedef struct us_cbdata { nvlist_t **cb_nvlp; uu_avl_pool_t *cb_avl_pool; uu_avl_t *cb_avl; boolean_t cb_numname; boolean_t cb_nicenum; boolean_t cb_sid2posix; zfs_userquota_prop_t cb_prop; zfs_sort_column_t *cb_sortcol; size_t cb_width[USFIELD_LAST]; } us_cbdata_t; static boolean_t us_populated = B_FALSE; typedef struct { zfs_sort_column_t *si_sortcol; boolean_t si_numname; } us_sort_info_t; static int us_field_index(char *field) { int i; for (i = 0; i < USFIELD_LAST; i++) { if (strcmp(field, us_field_names[i]) == 0) return (i); } return (-1); } static int us_compare(const void *larg, const void *rarg, void *unused) { const us_node_t *l = larg; const us_node_t *r = rarg; us_sort_info_t *si = (us_sort_info_t *)unused; zfs_sort_column_t *sortcol = si->si_sortcol; boolean_t numname = si->si_numname; nvlist_t *lnvl = l->usn_nvl; nvlist_t *rnvl = r->usn_nvl; int rc = 0; boolean_t lvb, rvb; for (; sortcol != NULL; sortcol = sortcol->sc_next) { char *lvstr = ""; char *rvstr = ""; uint32_t lv32 = 0; uint32_t rv32 = 0; uint64_t lv64 = 0; uint64_t rv64 = 0; zfs_prop_t prop = sortcol->sc_prop; const char *propname = NULL; boolean_t reverse = sortcol->sc_reverse; switch (prop) { case ZFS_PROP_TYPE: propname = "type"; (void) nvlist_lookup_uint32(lnvl, propname, &lv32); (void) nvlist_lookup_uint32(rnvl, propname, &rv32); if (rv32 != lv32) rc = (rv32 < lv32) ? 1 : -1; break; case ZFS_PROP_NAME: propname = "name"; if (numname) { compare_nums: (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; } else { if ((nvlist_lookup_string(lnvl, propname, &lvstr) == ENOENT) || (nvlist_lookup_string(rnvl, propname, &rvstr) == ENOENT)) { goto compare_nums; } rc = strcmp(lvstr, rvstr); } break; case ZFS_PROP_USED: case ZFS_PROP_QUOTA: if (!us_populated) break; if (prop == ZFS_PROP_USED) propname = "used"; else propname = "quota"; (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; break; default: break; } if (rc != 0) { if (rc < 0) return (reverse ? 1 : -1); else return (reverse ? -1 : 1); } } /* * If entries still seem to be the same, check if they are of the same * type (smbentity is added only if we are doing SID to POSIX ID * translation where we can have duplicate type/name combinations). */ if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && lvb != rvb) return (lvb < rvb ? -1 : 1); return (0); } static boolean_t zfs_prop_is_user(unsigned p) { return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA || p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA); } static boolean_t zfs_prop_is_group(unsigned p) { return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA || p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA); } static boolean_t zfs_prop_is_project(unsigned p) { return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA || p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA); } static inline const char * us_type2str(unsigned field_type) { switch (field_type) { case USTYPE_PSX_USR: return ("POSIX User"); case USTYPE_PSX_GRP: return ("POSIX Group"); case USTYPE_SMB_USR: return ("SMB User"); case USTYPE_SMB_GRP: return ("SMB Group"); case USTYPE_PROJ: return ("Project"); default: return ("Undefined"); } } static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { us_cbdata_t *cb = (us_cbdata_t *)arg; zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; char *propname; char sizebuf[32]; us_node_t *node; uu_avl_pool_t *avl_pool = cb->cb_avl_pool; uu_avl_t *avl = cb->cb_avl; uu_avl_index_t idx; nvlist_t *props; us_node_t *n; zfs_sort_column_t *sortcol = cb->cb_sortcol; unsigned type = 0; const char *typestr; size_t namelen; size_t typelen; size_t sizelen; int typeidx, nameidx, sizeidx; us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; boolean_t smbentity = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); node = safe_malloc(sizeof (us_node_t)); uu_avl_node_init(node, &node->usn_avlnode, avl_pool); node->usn_nvl = props; if (domain != NULL && domain[0] != '\0') { #ifdef HAVE_IDMAP /* SMB */ char sid[MAXNAMELEN + 32]; uid_t id; uint64_t classes; int err; directory_error_t e; smbentity = B_TRUE; (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { type = USTYPE_SMB_GRP; err = sid_to_id(sid, B_FALSE, &id); } else { type = USTYPE_SMB_USR; err = sid_to_id(sid, B_TRUE, &id); } if (err == 0) { rid = id; if (!cb->cb_sid2posix) { e = directory_name_from_sid(NULL, sid, &name, &classes); if (e != NULL) directory_error_free(e); if (name == NULL) name = sid; } } #else nvlist_free(props); free(node); return (-1); #endif /* HAVE_IDMAP */ } if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { /* POSIX or -i */ if (zfs_prop_is_group(prop)) { type = USTYPE_PSX_GRP; if (!cb->cb_numname) { struct group *g; if ((g = getgrgid(rid)) != NULL) name = g->gr_name; } } else if (zfs_prop_is_user(prop)) { type = USTYPE_PSX_USR; if (!cb->cb_numname) { struct passwd *p; if ((p = getpwuid(rid)) != NULL) name = p->pw_name; } } else { type = USTYPE_PROJ; } } /* * Make sure that the type/name combination is unique when doing * SID to POSIX ID translation (hence changing the type from SMB to * POSIX). */ if (cb->cb_sid2posix && nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) nomem(); /* Calculate/update width of TYPE field */ typestr = us_type2str(type); typelen = strlen(gettext(typestr)); typeidx = us_field_index("type"); if (typelen > cb->cb_width[typeidx]) cb->cb_width[typeidx] = typelen; if (nvlist_add_uint32(props, "type", type) != 0) nomem(); /* Calculate/update width of NAME field */ if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { if (nvlist_add_uint64(props, "name", rid) != 0) nomem(); namelen = snprintf(NULL, 0, "%u", rid); } else { if (nvlist_add_string(props, "name", name) != 0) nomem(); namelen = strlen(name); } nameidx = us_field_index("name"); if (nameidx >= 0 && namelen > cb->cb_width[nameidx]) cb->cb_width[nameidx] = namelen; /* * Check if this type/name combination is in the list and update it; * otherwise add new node to the list. */ if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { uu_avl_insert(avl, node, idx); } else { nvlist_free(props); free(node); node = n; props = node->usn_nvl; } /* Calculate/update width of USED/QUOTA fields */ if (cb->cb_nicenum) { if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTUSED || prop == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(space, sizebuf, sizeof (sizebuf)); } else { zfs_nicenum(space, sizebuf, sizeof (sizebuf)); } } else { (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", (u_longlong_t)space); } sizelen = strlen(sizebuf); if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_PROJECTUSED) { propname = "used"; if (!nvlist_exists(props, "quota")) (void) nvlist_add_uint64(props, "quota", 0); } else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTQUOTA) { propname = "quota"; if (!nvlist_exists(props, "used")) (void) nvlist_add_uint64(props, "used", 0); } else if (prop == ZFS_PROP_USEROBJUSED || prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) { propname = "objused"; if (!nvlist_exists(props, "objquota")) (void) nvlist_add_uint64(props, "objquota", 0); } else if (prop == ZFS_PROP_USEROBJQUOTA || prop == ZFS_PROP_GROUPOBJQUOTA || prop == ZFS_PROP_PROJECTOBJQUOTA) { propname = "objquota"; if (!nvlist_exists(props, "objused")) (void) nvlist_add_uint64(props, "objused", 0); } else { return (-1); } sizeidx = us_field_index(propname); if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx]) cb->cb_width[sizeidx] = sizelen; if (nvlist_add_uint64(props, propname, space) != 0) nomem(); return (0); } static void print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, us_node_t *node) { nvlist_t *nvl = node->usn_nvl; char valstr[MAXNAMELEN]; boolean_t first = B_TRUE; int cfield = 0; int field; uint32_t ustype; /* Check type */ (void) nvlist_lookup_uint32(nvl, "type", &ustype); if (!(ustype & types)) return; while ((field = fields[cfield]) != USFIELD_LAST) { nvpair_t *nvp = NULL; data_type_t type; uint32_t val32; uint64_t val64; char *strval = "-"; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { if (strcmp(nvpair_name(nvp), us_field_names[field]) == 0) break; } type = nvp == NULL ? DATA_TYPE_UNKNOWN : nvpair_type(nvp); switch (type) { case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &val32); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &val64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &strval); break; case DATA_TYPE_UNKNOWN: break; default: (void) fprintf(stderr, "invalid data type\n"); } switch (field) { case USFIELD_TYPE: if (type == DATA_TYPE_UINT32) strval = (char *)us_type2str(val32); break; case USFIELD_NAME: if (type == DATA_TYPE_UINT64) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } break; case USFIELD_USED: case USFIELD_QUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_QUOTA && val64 == 0) { strval = "none"; } else { zfs_nicebytes(val64, valstr, sizeof (valstr)); strval = valstr; } } break; case USFIELD_OBJUSED: case USFIELD_OBJQUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_OBJQUOTA && val64 == 0) { strval = "none"; } else { zfs_nicenum(val64, valstr, sizeof (valstr)); strval = valstr; } } break; } if (!first) { if (scripted) (void) printf("\t"); else (void) printf(" "); } if (scripted) (void) printf("%s", strval); else if (field == USFIELD_TYPE || field == USFIELD_NAME) (void) printf("%-*s", (int)width[field], strval); else (void) printf("%*s", (int)width[field], strval); first = B_FALSE; cfield++; } (void) printf("\n"); } static void print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, boolean_t rmnode, uu_avl_t *avl) { us_node_t *node; const char *col; int cfield = 0; int field; if (!scripted) { boolean_t first = B_TRUE; while ((field = fields[cfield]) != USFIELD_LAST) { col = gettext(us_field_hdr[field]); if (field == USFIELD_TYPE || field == USFIELD_NAME) { (void) printf(first ? "%-*s" : " %-*s", (int)width[field], col); } else { (void) printf(first ? "%*s" : " %*s", (int)width[field], col); } first = B_FALSE; cfield++; } (void) printf("\n"); } for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { print_us_node(scripted, parsable, fields, types, width, node); if (rmnode) nvlist_free(node->usn_nvl); } } static int zfs_do_userspace(int argc, char **argv) { zfs_handle_t *zhp; zfs_userquota_prop_t p; uu_avl_pool_t *avl_pool; uu_avl_t *avl_tree; uu_avl_walk_t *walk; char *delim; char deffields[] = "type,name,used,quota,objused,objquota"; char *ofield = NULL; char *tfield = NULL; int cfield = 0; int fields[256]; int i; boolean_t scripted = B_FALSE; boolean_t prtnum = B_FALSE; boolean_t parsable = B_FALSE; boolean_t sid2posix = B_FALSE; int ret = 0; int c; zfs_sort_column_t *sortcol = NULL; int types = USTYPE_PSX_USR | USTYPE_SMB_USR; us_cbdata_t cb; us_node_t *node; us_node_t *rmnode; uu_list_pool_t *listpool; uu_list_t *list; uu_avl_index_t idx = 0; uu_list_index_t idx2 = 0; if (argc < 2) usage(B_FALSE); if (strcmp(argv[0], "groupspace") == 0) { /* Toggle default group types */ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; } else if (strcmp(argv[0], "projectspace") == 0) { types = USTYPE_PROJ; prtnum = B_TRUE; } while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { switch (c) { case 'n': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'n'\n")); usage(B_FALSE); } prtnum = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'o': ofield = optarg; break; case 's': case 'S': if (zfs_add_sort_column(&sortcol, optarg, c == 's' ? B_FALSE : B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid field '%s'\n"), optarg); usage(B_FALSE); } break; case 't': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 't'\n")); usage(B_FALSE); } tfield = optarg; break; case 'i': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'i'\n")); usage(B_FALSE); } sid2posix = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* Use default output fields if not specified using -o */ if (ofield == NULL) ofield = deffields; do { if ((delim = strchr(ofield, ',')) != NULL) *delim = '\0'; if ((fields[cfield++] = us_field_index(ofield)) == -1) { (void) fprintf(stderr, gettext("invalid type '%s' " "for -o option\n"), ofield); return (-1); } if (delim != NULL) ofield = delim + 1; } while (delim != NULL); fields[cfield] = USFIELD_LAST; /* Override output types (-t option) */ if (tfield != NULL) { types = 0; do { boolean_t found = B_FALSE; if ((delim = strchr(tfield, ',')) != NULL) *delim = '\0'; for (i = 0; i < sizeof (us_type_bits) / sizeof (int); i++) { if (strcmp(tfield, us_type_names[i]) == 0) { found = B_TRUE; types |= us_type_bits[i]; break; } } if (!found) { (void) fprintf(stderr, gettext("invalid type " "'%s' for -t option\n"), tfield); return (-1); } if (delim != NULL) tfield = delim + 1; } while (delim != NULL); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) return (1); if (zhp->zfs_head_type != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("operation is only applicable " "to filesystems and their snapshots\n")); zfs_close(zhp); return (1); } if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) nomem(); if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) nomem(); /* Always add default sorting columns */ (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); cb.cb_sortcol = sortcol; cb.cb_numname = prtnum; cb.cb_nicenum = !parsable; cb.cb_avl_pool = avl_pool; cb.cb_avl = avl_tree; cb.cb_sid2posix = sid2posix; for (i = 0; i < USFIELD_LAST; i++) cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { if ((zfs_prop_is_user(p) && !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || (zfs_prop_is_group(p) && !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) || (zfs_prop_is_project(p) && types != USTYPE_PROJ)) continue; cb.cb_prop = p; if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) { zfs_close(zhp); return (ret); } } zfs_close(zhp); /* Sort the list */ if ((node = uu_avl_first(avl_tree)) == NULL) return (0); us_populated = B_TRUE; listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); list = uu_list_create(listpool, NULL, UU_DEFAULT); uu_list_node_init(node, &node->usn_listnode, listpool); while (node != NULL) { rmnode = node; node = uu_avl_next(avl_tree, node); uu_avl_remove(avl_tree, rmnode); if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) uu_list_insert(list, rmnode, idx2); } for (node = uu_list_first(list); node != NULL; node = uu_list_next(list, node)) { us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) uu_avl_insert(avl_tree, node, idx); } uu_list_destroy(list); uu_list_pool_destroy(listpool); /* Print and free node nvlist memory */ print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, cb.cb_avl); zfs_free_sort_columns(sortcol); /* Clean up the AVL tree */ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(cb.cb_avl, node); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(avl_tree); uu_avl_pool_destroy(avl_pool); return (ret); } /* * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] * [-t type[,...]] [filesystem|volume|snapshot] ... * * -H Scripted mode; elide headers and separate columns by tabs * -p Display values in parsable (literal) format. * -r Recurse over all children * -d Limit recursion by depth. * -o Control which fields to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * When given no arguments, list all filesystems in the system. * Otherwise, list the specified datasets, optionally recursing down them if * '-r' is specified. */ typedef struct list_cbdata { boolean_t cb_first; boolean_t cb_literal; boolean_t cb_scripted; zprop_list_t *cb_proplist; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZFS_MAXPROPLEN]; const char *header; int i; boolean_t first = B_TRUE; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zfs_prop_column_name(pl->pl_prop); right_justify = zfs_prop_align_right(pl->pl_prop); } else { for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, header); else (void) printf("%-*s", (int)pl->pl_width, header); } (void) printf("\n"); } /* * Given a dataset and a list of fields, print out all the properties according * to the described layout. */ static void print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZFS_MAXPROPLEN]; nvlist_t *userprops = zfs_get_user_props(zhp); nvlist_t *propval; char *propstr; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } if (pl->pl_prop == ZFS_PROP_NAME) { (void) strlcpy(property, zfs_get_name(zhp), sizeof (property)); propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (zfs_prop_userquota(pl->pl_user_prop)) { if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else if (zfs_prop_written(pl->pl_user_prop)) { if (zfs_prop_get_written(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) propstr = "-"; else verify(nvlist_lookup_string(propval, ZPROP_VALUE, &propstr) == 0); right_justify = B_FALSE; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, propstr); else (void) printf("%-*s", (int)pl->pl_width, propstr); } (void) printf("\n"); } /* * Generic callback function to list a dataset or snapshot. */ static int list_callback(zfs_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; if (cbp->cb_first) { if (!cbp->cb_scripted) print_header(cbp); cbp->cb_first = B_FALSE; } print_dataset(zhp, cbp); return (0); } static int zfs_do_list(int argc, char **argv) { int c; static char default_fields[] = "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; boolean_t types_specified = B_FALSE; char *fields = NULL; list_cbdata_t cb = { 0 }; char *value; int limit = 0; int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { switch (c) { case 'o': fields = optarg; break; case 'p': cb.cb_literal = B_TRUE; flags |= ZFS_ITER_LITERAL_PROPS; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 's': if (zfs_add_sort_column(&sortcol, optarg, B_FALSE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 'S': if (zfs_add_sort_column(&sortcol, optarg, B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 't': types = 0; types_specified = B_TRUE; flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all", NULL }; switch (getsubopt(&optarg, type_subopts, &value)) { case 0: types |= ZFS_TYPE_FILESYSTEM; break; case 1: types |= ZFS_TYPE_VOLUME; break; case 2: case 3: types |= ZFS_TYPE_SNAPSHOT; break; case 4: types |= ZFS_TYPE_BOOKMARK; break; case 5: types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; break; default: (void) fprintf(stderr, gettext("invalid type '%s'\n"), value); usage(B_FALSE); } } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (fields == NULL) fields = default_fields; /* * If we are only going to list snapshot names and sort by name, * then we can use faster version. */ if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol)) flags |= ZFS_ITER_SIMPLE; /* * If "-o space" and no types were specified, don't display snapshots. */ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) types &= ~ZFS_TYPE_SNAPSHOT; /* * Handle users who want to list all snapshots or bookmarks * of the current dataset (ex. 'zfs list -t snapshot '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } /* * If the user specifies '-o all', the zprop_get_list() doesn't * normally include the name of the dataset. For 'zfs list', we always * want this property to be first. */ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); if (ret == 0 && cb.cb_first && !cb.cb_scripted) (void) fprintf(stderr, gettext("no datasets available\n")); return (ret); } /* - * zfs rename [-f] + * zfs rename [-fu] * zfs rename [-f] -p - * zfs rename -r + * zfs rename [-u] -r * * Renames the given dataset to another of the same type. * * The '-p' flag creates all the non-existing ancestors of the target first. + * The '-u' flag prevents file systems from being remounted during rename. */ /* ARGSUSED */ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; + renameflags_t flags = { 0 }; int c; int ret = 0; - boolean_t recurse = B_FALSE; + int types; boolean_t parents = B_FALSE; - boolean_t force_unmount = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "prf")) != -1) { + while ((c = getopt(argc, argv, "pruf")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': - recurse = B_TRUE; + flags.recursive = B_TRUE; break; + case 'u': + flags.nounmount = B_TRUE; + break; case 'f': - force_unmount = B_TRUE; + flags.forceunmount = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - if (recurse && parents) { + if (flags.recursive && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } - if (recurse && strchr(argv[0], '@') == 0) { + if (flags.nounmount && parents) { + (void) fprintf(stderr, gettext("-u and -p options are mutually " + "exclusive\n")); + usage(B_FALSE); + } + + if (flags.recursive && strchr(argv[0], '@') == 0) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } - if ((zhp = zfs_open(g_zfs, argv[0], parents ? ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME : ZFS_TYPE_DATASET)) == NULL) + if (flags.nounmount) + types = ZFS_TYPE_FILESYSTEM; + else if (parents) + types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + else + types = ZFS_TYPE_DATASET; + + if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. */ if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) && zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); return (1); } - ret = (zfs_rename(zhp, argv[1], recurse, force_unmount) != 0); + ret = (zfs_rename(zhp, argv[1], flags) != 0); zfs_close(zhp); return (ret); } /* * zfs promote * * Promotes the given clone fs to be the parent */ /* ARGSUSED */ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing clone filesystem" " argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); ret = (zfs_promote(zhp) != 0); zfs_close(zhp); return (ret); } static int zfs_do_redact(int argc, char **argv) { char *snap = NULL; char *bookname = NULL; char **rsnaps = NULL; int numrsnaps = 0; argv++; argc--; if (argc < 3) { (void) fprintf(stderr, gettext("too few arguments\n")); usage(B_FALSE); } snap = argv[0]; bookname = argv[1]; rsnaps = argv + 2; numrsnaps = argc - 2; nvlist_t *rsnapnv = fnvlist_alloc(); for (int i = 0; i < numrsnaps; i++) { fnvlist_add_boolean(rsnapnv, rsnaps[i]); } int err = lzc_redact(snap, bookname, rsnapnv); fnvlist_free(rsnapnv); switch (err) { case 0: break; case ENOENT: (void) fprintf(stderr, gettext("provided snapshot %s does not exist\n"), snap); break; case EEXIST: (void) fprintf(stderr, gettext("specified redaction bookmark " "(%s) provided already exists\n"), bookname); break; case ENAMETOOLONG: (void) fprintf(stderr, gettext("provided bookmark name cannot " "be used, final name would be too long\n")); break; case E2BIG: (void) fprintf(stderr, gettext("too many redaction snapshots " "specified\n")); break; case EINVAL: if (strchr(bookname, '#') != NULL) (void) fprintf(stderr, gettext( "redaction bookmark name must not contain '#'\n")); else (void) fprintf(stderr, gettext( "redaction snapshot must be descendent of " "snapshot being redacted\n")); break; case EALREADY: (void) fprintf(stderr, gettext("attempted to redact redacted " "dataset or with respect to redacted dataset\n")); break; case ENOTSUP: (void) fprintf(stderr, gettext("redaction bookmarks feature " "not enabled\n")); break; case EXDEV: (void) fprintf(stderr, gettext("potentially invalid redaction " "snapshot; full dataset names required\n")); break; default: (void) fprintf(stderr, gettext("internal error: %s\n"), strerror(errno)); } return (err); } /* * zfs rollback [-rRf] * * -r Delete any intervening snapshots before doing rollback * -R Delete any snapshots and their clones * -f ignored for backwards compatibility * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, * the command will complain unless the '-r' flag is given. */ typedef struct rollback_cbdata { uint64_t cb_create; uint8_t cb_younger_ds_printed; boolean_t cb_first; int cb_doclones; char *cb_target; int cb_error; boolean_t cb_recurse; } rollback_cbdata_t; static int rollback_check_dependent(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; if (cbp->cb_first && cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot rollback to " "'%s': clones of previous snapshots exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-R' to " "force deletion of the following clones and " "dependents:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); zfs_close(zhp); return (0); } /* * Report some snapshots/bookmarks more recent than the one specified. * Used when '-r' is not specified. We reuse this same callback for the * snapshot dependents - if 'cb_dependent' is set, then this is a * dependent and we should report it without checking the transaction group. */ static int rollback_check(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; /* * Max number of younger snapshots and/or bookmarks to display before * we stop the iteration. */ const uint8_t max_younger = 32; if (cbp->cb_doclones) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { if (cbp->cb_first && !cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot " "rollback to '%s': more recent snapshots " "or bookmarks exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-r' to " "force deletion of the following " "snapshots and bookmarks:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } if (cbp->cb_recurse) { if (zfs_iter_dependents(zhp, B_TRUE, rollback_check_dependent, cbp) != 0) { zfs_close(zhp); return (-1); } } else { (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); cbp->cb_younger_ds_printed++; } } zfs_close(zhp); if (cbp->cb_younger_ds_printed == max_younger) { /* * This non-recursive rollback is going to fail due to the * presence of snapshots and/or bookmarks that are younger than * the rollback target. * We printed some of the offending objects, now we stop * zfs_iter_snapshot/bookmark iteration so we can fail fast and * avoid iterating over the rest of the younger objects */ (void) fprintf(stderr, gettext("Output limited to %d " "snapshots/bookmarks\n"), max_younger); return (-1); } return (0); } static int zfs_do_rollback(int argc, char **argv) { int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; zfs_handle_t *zhp, *snap; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *delim; uint64_t min_txg = 0; /* check options */ while ((c = getopt(argc, argv, "rRf")) != -1) { switch (c) { case 'r': cb.cb_recurse = 1; break; case 'R': cb.cb_recurse = 1; cb.cb_doclones = 1; break; case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* open the snapshot */ if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) return (1); /* open the parent dataset */ (void) strlcpy(parentname, argv[0], sizeof (parentname)); verify((delim = strrchr(parentname, '@')) != NULL); *delim = '\0'; if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { zfs_close(snap); return (1); } /* * Check for more recent snapshots and/or clones based on the presence * of '-r' and '-R'. */ cb.cb_target = argv[0]; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); cb.cb_first = B_TRUE; cb.cb_error = 0; if (cb.cb_create > 0) min_txg = cb.cb_create; if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb, min_txg, 0)) != 0) goto out; if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) goto out; /* * Rollback parent to the given snapshot. */ ret = zfs_rollback(zhp, snap, force); out: zfs_close(snap); zfs_close(zhp); if (ret == 0) return (0); else return (1); } /* * zfs set property=value ... { fs | snap | vol } ... * * Sets the given properties for all datasets specified on the command line. */ static int set_callback(zfs_handle_t *zhp, void *data) { nvlist_t *props = data; if (zfs_prop_set_list(zhp, props) != 0) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to remount filesystem\n")); break; case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); break; } return (1); } return (0); } static int zfs_do_set(int argc, char **argv) { nvlist_t *props = NULL; int ds_start = -1; /* argv idx of first dataset arg */ int ret = 0; int i; /* check for options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } if (argc < 3) { if (strchr(argv[1], '=') == NULL) { (void) fprintf(stderr, gettext("missing property=value " "argument(s)\n")); } else { (void) fprintf(stderr, gettext("missing dataset " "name(s)\n")); } usage(B_FALSE); } /* validate argument order: prop=val args followed by dataset args */ for (i = 1; i < argc; i++) { if (strchr(argv[i], '=') != NULL) { if (ds_start > 0) { /* out-of-order prop=val argument */ (void) fprintf(stderr, gettext("invalid " "argument order\n")); usage(B_FALSE); } } else if (ds_start < 0) { ds_start = i; } } if (ds_start < 0) { (void) fprintf(stderr, gettext("missing dataset name(s)\n")); usage(B_FALSE); } /* Populate a list of property settings */ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 1; i < ds_start; i++) { if (!parseprop(props, argv[i])) { ret = -1; goto error; } } ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); error: nvlist_free(props); return (ret); } typedef struct snap_cbdata { nvlist_t *sd_nvl; boolean_t sd_recursive; const char *sd_snapname; } snap_cbdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snap_cbdata_t *sd = arg; char *name; int rv = 0; int error; if (sd->sd_recursive && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { zfs_close(zhp); return (0); } error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); if (error == -1) nomem(); fnvlist_add_boolean(sd->sd_nvl, name); free(name); if (sd->sd_recursive) rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); zfs_close(zhp); return (rv); } /* * zfs snapshot [-r] [-o prop=value] ... * * Creates a snapshot with the given name. While functionally equivalent to * 'zfs create', it is a separate command to differentiate intent. */ static int zfs_do_snapshot(int argc, char **argv) { int ret = 0; int c; nvlist_t *props; snap_cbdata_t sd = { 0 }; boolean_t multiple_snaps = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(sd.sd_nvl); nvlist_free(props); return (1); } break; case 'r': sd.sd_recursive = B_TRUE; multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } if (argc > 1) multiple_snaps = B_TRUE; for (; argc > 0; argc--, argv++) { char *atp; zfs_handle_t *zhp; atp = strchr(argv[0], '@'); if (atp == NULL) goto usage; *atp = '\0'; sd.sd_snapname = atp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) goto usage; if (zfs_snapshot_cb(zhp, &sd) != 0) goto usage; } ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); nvlist_free(props); if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Send a backup stream to stdout. */ static int zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; char *redactbook = NULL; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, {"redact", required_argument, NULL, 'd'}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, {"verbose", no_argument, NULL, 'v'}, {"dryrun", no_argument, NULL, 'n'}, {"large-block", no_argument, NULL, 'L'}, {"embed", no_argument, NULL, 'e'}, {"resume", required_argument, NULL, 't'}, {"compressed", no_argument, NULL, 'c'}, {"raw", no_argument, NULL, 'w'}, {"backup", no_argument, NULL, 'b'}, {"holds", no_argument, NULL, 'h'}, {"saved", no_argument, NULL, 'S'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwbd:S", long_options, NULL)) != -1) { switch (c) { case 'i': if (fromname) usage(B_FALSE); fromname = optarg; break; case 'I': if (fromname) usage(B_FALSE); fromname = optarg; flags.doall = B_TRUE; break; case 'R': flags.replicate = B_TRUE; break; case 'd': redactbook = optarg; break; case 'p': flags.props = B_TRUE; break; case 'b': flags.backup = B_TRUE; break; case 'h': flags.holds = B_TRUE; break; case 'P': flags.parsable = B_TRUE; break; case 'v': flags.verbosity++; flags.progress = B_TRUE; break; case 'D': (void) fprintf(stderr, gettext("WARNING: deduplicated send is no " "longer supported. A regular,\n" "non-deduplicated stream will be generated.\n\n")); break; case 'n': flags.dryrun = B_TRUE; break; case 'L': flags.largeblock = B_TRUE; break; case 'e': flags.embed_data = B_TRUE; break; case 't': resume_token = optarg; break; case 'c': flags.compress = B_TRUE; break; case 'w': flags.raw = B_TRUE; flags.compress = B_TRUE; flags.embed_data = B_TRUE; flags.largeblock = B_TRUE; break; case 'S': flags.saved = B_TRUE; break; case ':': /* * If a parameter was not passed, optopt contains the * value that would normally lead us into the * appropriate case statement. If it's > 256, then this * must be a longopt and we should look at argv to get * the string. Otherwise it's just the character, so we * should use it directly. */ if (optopt <= UINT8_MAX) { (void) fprintf(stderr, gettext("missing argument for '%c' " "option\n"), optopt); } else { (void) fprintf(stderr, gettext("missing argument for '%s' " "option\n"), argv[optind - 1]); } usage(B_FALSE); break; case '?': /*FALLTHROUGH*/ default: /* * If an invalid flag was passed, optopt contains the * character if it was a short flag, or 0 if it was a * longopt. */ if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } if (flags.parsable && flags.verbosity == 0) flags.verbosity = 1; argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || flags.backup || flags.holds || flags.saved || redactbook != NULL) { (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } if (argc > 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } if (flags.saved) { if (fromname != NULL || flags.replicate || flags.props || flags.doall || flags.backup || flags.holds || flags.largeblock || flags.embed_data || flags.compress || flags.raw || redactbook != NULL) { (void) fprintf(stderr, gettext("incompatible flags " "combined with saved send flag\n")); usage(B_FALSE); } if (strchr(argv[0], '@') != NULL) { (void) fprintf(stderr, gettext("saved send must " "specify the dataset with partially-received " "state\n")); usage(B_FALSE); } } if (flags.raw && redactbook != NULL) { (void) fprintf(stderr, gettext("Error: raw sends may not be redacted.\n")); return (1); } if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); return (1); } if (flags.saved) { zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_saved(zhp, &flags, STDOUT_FILENO, resume_token); zfs_close(zhp); return (err != 0); } else if (resume_token != NULL) { return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, resume_token)); } /* * For everything except -R and -I, use the new, cleaner code path. */ if (!(flags.replicate || flags.doall)) { char frombuf[ZFS_MAX_DATASET_NAME_LEN]; if (fromname != NULL && (strchr(fromname, '#') == NULL && strchr(fromname, '@') == NULL)) { /* * Neither bookmark or snapshot was specified. Print a * warning, and assume snapshot. */ (void) fprintf(stderr, "Warning: incremental source " "didn't specify type, assuming snapshot. Use '@' " "or '#' prefix to avoid ambiguity.\n"); (void) snprintf(frombuf, sizeof (frombuf), "@%s", fromname); fromname = frombuf; } if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. */ char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); fromname = frombuf; } zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, redactbook); zfs_close(zhp); return (err != 0); } if (fromname != NULL && strchr(fromname, '#')) { (void) fprintf(stderr, gettext("Error: multiple snapshots cannot be " "sent from a bookmark.\n")); return (1); } if (redactbook != NULL) { (void) fprintf(stderr, gettext("Error: multiple snapshots " "cannot be sent redacted.\n")); return (1); } if ((cp = strchr(argv[0], '@')) == NULL) { (void) fprintf(stderr, gettext("Error: " "Unsupported flag with filesystem or bookmark.\n")); return (1); } *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); /* * If they specified the full path to the snapshot, chop off * everything except the short name of the snapshot, but special * case if they specify the origin. */ if (fromname && (cp = strchr(fromname, '@')) != NULL) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE); if (strcmp(origin, fromname) == 0) { fromname = NULL; flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { (void) fprintf(stderr, gettext("incremental source must be " "in same filesystem\n")); usage(B_FALSE); } fromname = cp + 1; if (strchr(fromname, '@') || strchr(fromname, '/')) { (void) fprintf(stderr, gettext("invalid incremental source\n")); usage(B_FALSE); } } } if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, flags.verbosity >= 3 ? &dbgnv : NULL); if (flags.verbosity >= 3 && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. Make it print to stderr * instead. */ (void) dup2(STDERR_FILENO, STDOUT_FILENO); dump_nvlist(dbgnv, 0); nvlist_free(dbgnv); } zfs_close(zhp); return (err != 0); } /* * Restore a backup stream from stdin. */ static int zfs_do_receive(int argc, char **argv) { int c, err = 0; recvflags_t flags = { 0 }; boolean_t abort_resumable = B_FALSE; nvlist_t *props; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'x': if (!parsepropname(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'd': if (flags.istail) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.isprefix = B_TRUE; break; case 'e': if (flags.isprefix) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.istail = B_TRUE; break; case 'h': flags.skipholds = B_TRUE; break; case 'M': flags.forceunmount = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'u': flags.nomount = B_TRUE; break; case 'v': flags.verbose = B_TRUE; break; case 's': flags.resumable = B_TRUE; break; case 'F': flags.force = B_TRUE; break; case 'A': abort_resumable = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */ if (flags.istail) flags.isprefix = B_TRUE; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (abort_resumable) { if (flags.isprefix || flags.istail || flags.dryrun || flags.resumable || flags.nomount) { (void) fprintf(stderr, gettext("invalid option\n")); usage(B_FALSE); } char namebuf[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(namebuf, sizeof (namebuf), "%s/%%recv", argv[0]); if (zfs_dataset_exists(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_handle_t *zhp = zfs_open(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(props); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } else { zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { (void) fprintf(stderr, gettext("'%s' does not have any " "resumable receive state to abort\n"), argv[0]); nvlist_free(props); zfs_close(zhp); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } nvlist_free(props); return (err != 0); } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " "from a terminal.\n" "You must redirect standard input.\n")); nvlist_free(props); return (1); } err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); nvlist_free(props); return (err != 0); } /* * allow/unallow stuff */ /* copied from zfs/sys/dsl_deleg.h */ #define ZFS_DELEG_PERM_CREATE "create" #define ZFS_DELEG_PERM_DESTROY "destroy" #define ZFS_DELEG_PERM_SNAPSHOT "snapshot" #define ZFS_DELEG_PERM_ROLLBACK "rollback" #define ZFS_DELEG_PERM_CLONE "clone" #define ZFS_DELEG_PERM_PROMOTE "promote" #define ZFS_DELEG_PERM_RENAME "rename" #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ #define ZFS_DELEG_PERM_USERQUOTA "userquota" #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" #define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota" #define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota" #define ZFS_DELEG_PERM_USEROBJUSED "userobjused" #define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused" #define ZFS_DELEG_PERM_HOLD "hold" #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" #define ZFS_DELEG_PERM_PROJECTUSED "projectused" #define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota" #define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused" #define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota" #define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY }, { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY }, { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, { ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA }, { ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED }, { ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA }, { ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED }, { ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED }, { ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA }, { ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED }, { ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA }, { NULL, ZFS_DELEG_NOTE_NONE } }; /* permission structure */ typedef struct deleg_perm { zfs_deleg_who_type_t dp_who_type; const char *dp_name; boolean_t dp_local; boolean_t dp_descend; } deleg_perm_t; /* */ typedef struct deleg_perm_node { deleg_perm_t dpn_perm; uu_avl_node_t dpn_avl_node; } deleg_perm_node_t; typedef struct fs_perm fs_perm_t; /* permissions set */ typedef struct who_perm { zfs_deleg_who_type_t who_type; const char *who_name; /* id */ char who_ug_name[256]; /* user/group name */ fs_perm_t *who_fsperm; /* uplink */ uu_avl_t *who_deleg_perm_avl; /* permissions */ } who_perm_t; /* */ typedef struct who_perm_node { who_perm_t who_perm; uu_avl_node_t who_avl_node; } who_perm_node_t; typedef struct fs_perm_set fs_perm_set_t; /* fs permissions */ struct fs_perm { const char *fsp_name; uu_avl_t *fsp_sc_avl; /* sets,create */ uu_avl_t *fsp_uge_avl; /* user,group,everyone */ fs_perm_set_t *fsp_set; /* uplink */ }; /* */ typedef struct fs_perm_node { fs_perm_t fspn_fsperm; uu_avl_t *fspn_avl; uu_list_node_t fspn_list_node; } fs_perm_node_t; /* top level structure */ struct fs_perm_set { uu_list_pool_t *fsps_list_pool; uu_list_t *fsps_list; /* list of fs_perms */ uu_avl_pool_t *fsps_named_set_avl_pool; uu_avl_pool_t *fsps_who_perm_avl_pool; uu_avl_pool_t *fsps_deleg_perm_avl_pool; }; static inline const char * deleg_perm_type(zfs_deleg_note_t note) { /* subcommands */ switch (note) { /* SUBCOMMANDS */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: case ZFS_DELEG_NOTE_GROUPUSED: case ZFS_DELEG_NOTE_USERPROP: case ZFS_DELEG_NOTE_USERQUOTA: case ZFS_DELEG_NOTE_USERUSED: case ZFS_DELEG_NOTE_USEROBJQUOTA: case ZFS_DELEG_NOTE_USEROBJUSED: case ZFS_DELEG_NOTE_GROUPOBJQUOTA: case ZFS_DELEG_NOTE_GROUPOBJUSED: case ZFS_DELEG_NOTE_PROJECTUSED: case ZFS_DELEG_NOTE_PROJECTQUOTA: case ZFS_DELEG_NOTE_PROJECTOBJUSED: case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: /* other */ return (gettext("other")); default: return (gettext("subcommand")); } } static int who_type2weight(zfs_deleg_who_type_t who_type) { int res; switch (who_type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: res = 0; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: res = 1; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: res = 2; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: res = 3; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: res = 4; break; default: res = -1; } return (res); } /* ARGSUSED */ static int who_perm_compare(const void *larg, const void *rarg, void *unused) { const who_perm_node_t *l = larg; const who_perm_node_t *r = rarg; zfs_deleg_who_type_t ltype = l->who_perm.who_type; zfs_deleg_who_type_t rtype = r->who_perm.who_type; int lweight = who_type2weight(ltype); int rweight = who_type2weight(rtype); int res = lweight - rweight; if (res == 0) res = strncmp(l->who_perm.who_name, r->who_perm.who_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } /* ARGSUSED */ static int deleg_perm_compare(const void *larg, const void *rarg, void *unused) { const deleg_perm_node_t *l = larg; const deleg_perm_node_t *r = rarg; int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static inline void fs_perm_set_init(fs_perm_set_t *fspset) { bzero(fspset, sizeof (fs_perm_set_t)); if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) == NULL) nomem(); } static inline void fs_perm_fini(fs_perm_t *); static inline void who_perm_fini(who_perm_t *); static inline void fs_perm_set_fini(fs_perm_set_t *fspset) { fs_perm_node_t *node = uu_list_first(fspset->fsps_list); while (node != NULL) { fs_perm_node_t *next_node = uu_list_next(fspset->fsps_list, node); fs_perm_t *fsperm = &node->fspn_fsperm; fs_perm_fini(fsperm); uu_list_remove(fspset->fsps_list, node); free(node); node = next_node; } uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); } static inline void deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, const char *name) { deleg_perm->dp_who_type = type; deleg_perm->dp_name = name; } static inline void who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, zfs_deleg_who_type_t type, const char *name) { uu_avl_pool_t *pool; pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; bzero(who_perm, sizeof (who_perm_t)); if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) nomem(); who_perm->who_type = type; who_perm->who_name = name; who_perm->who_fsperm = fsperm; } static inline void who_perm_fini(who_perm_t *who_perm) { deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); while (node != NULL) { deleg_perm_node_t *next_node = uu_avl_next(who_perm->who_deleg_perm_avl, node); uu_avl_remove(who_perm->who_deleg_perm_avl, node); free(node); node = next_node; } uu_avl_destroy(who_perm->who_deleg_perm_avl); } static inline void fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) { uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; bzero(fsperm, sizeof (fs_perm_t)); if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) == NULL) nomem(); fsperm->fsp_set = fspset; fsperm->fsp_name = fsname; } static inline void fs_perm_fini(fs_perm_t *fsperm) { who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_sc_avl, node); free(node); node = next_node; } node = uu_avl_first(fsperm->fsp_uge_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_uge_avl, node); free(node); node = next_node; } uu_avl_destroy(fsperm->fsp_sc_avl); uu_avl_destroy(fsperm->fsp_uge_avl); } static void set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, zfs_deleg_who_type_t who_type, const char *name, char locality) { uu_avl_index_t idx = 0; deleg_perm_node_t *found_node = NULL; deleg_perm_t *deleg_perm = &node->dpn_perm; deleg_perm_init(deleg_perm, who_type, name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) uu_avl_insert(avl, node, idx); else { node = found_node; deleg_perm = &node->dpn_perm; } switch (locality) { case ZFS_DELEG_LOCAL: deleg_perm->dp_local = B_TRUE; break; case ZFS_DELEG_DESCENDENT: deleg_perm->dp_descend = B_TRUE; break; case ZFS_DELEG_NA: break; default: assert(B_FALSE); /* invalid locality */ } } static inline int parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; uu_avl_t *avl = who_perm->who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_perm->who_type; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *name = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; deleg_perm_node_t *node = safe_malloc(sizeof (deleg_perm_node_t)); VERIFY(type == DATA_TYPE_BOOLEAN); uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); set_deleg_perm_node(avl, node, who_type, name, locality); } return (0); } static inline int parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = fsperm->fsp_set; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *name = nvpair_name(nvp); uu_avl_t *avl = NULL; uu_avl_pool_t *avl_pool = NULL; zfs_deleg_who_type_t perm_type = name[0]; char perm_locality = name[1]; const char *perm_name = name + 3; who_perm_t *who_perm = NULL; assert('$' == name[2]); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); switch (perm_type) { case ZFS_DELEG_CREATE: case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_NAMED_SET: case ZFS_DELEG_NAMED_SET_SETS: avl_pool = fspset->fsps_named_set_avl_pool; avl = fsperm->fsp_sc_avl; break; case ZFS_DELEG_USER: case ZFS_DELEG_USER_SETS: case ZFS_DELEG_GROUP: case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_EVERYONE: case ZFS_DELEG_EVERYONE_SETS: avl_pool = fspset->fsps_who_perm_avl_pool; avl = fsperm->fsp_uge_avl; break; default: assert(!"unhandled zfs_deleg_who_type_t"); } who_perm_node_t *found_node = NULL; who_perm_node_t *node = safe_malloc( sizeof (who_perm_node_t)); who_perm = &node->who_perm; uu_avl_index_t idx = 0; uu_avl_node_init(node, &node->who_avl_node, avl_pool); who_perm_init(who_perm, fsperm, perm_type, perm_name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) { if (avl == fsperm->fsp_uge_avl) { uid_t rid = 0; struct passwd *p = NULL; struct group *g = NULL; const char *nice_name = NULL; switch (perm_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: rid = atoi(perm_name); p = getpwuid(rid); if (p) nice_name = p->pw_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: rid = atoi(perm_name); g = getgrgid(rid); if (g) nice_name = g->gr_name; break; default: break; } if (nice_name != NULL) { (void) strlcpy( node->who_perm.who_ug_name, nice_name, 256); } else { /* User or group unknown */ (void) snprintf( node->who_perm.who_ug_name, sizeof (node->who_perm.who_ug_name), "(unknown: %d)", rid); } } uu_avl_insert(avl, node, idx); } else { node = found_node; who_perm = &node->who_perm; } assert(who_perm != NULL); (void) parse_who_perm(who_perm, nvl2, perm_locality); } return (0); } static inline int parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) { nvpair_t *nvp = NULL; uu_avl_index_t idx = 0; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *fsname = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); fs_perm_t *fsperm = NULL; fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); if (node == NULL) nomem(); fsperm = &node->fspn_fsperm; VERIFY(DATA_TYPE_NVLIST == type); uu_list_node_init(node, &node->fspn_list_node, fspset->fsps_list_pool); idx = uu_list_numnodes(fspset->fsps_list); fs_perm_init(fsperm, fspset, fsname); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); (void) parse_fs_perm(fsperm, nvl2); uu_list_insert(fspset->fsps_list, node, idx); } return (0); } static inline const char * deleg_perm_comment(zfs_deleg_note_t note) { const char *str = ""; /* subcommands */ switch (note) { /* SUBCOMMANDS */ case ZFS_DELEG_NOTE_ALLOW: str = gettext("Must also have the permission that is being" "\n\t\t\t\tallowed"); break; case ZFS_DELEG_NOTE_CLONE: str = gettext("Must also have the 'create' ability and 'mount'" "\n\t\t\t\tability in the origin file system"); break; case ZFS_DELEG_NOTE_CREATE: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DESTROY: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DIFF: str = gettext("Allows lookup of paths within a dataset;" "\n\t\t\t\tgiven an object number. Ordinary users need this" "\n\t\t\t\tin order to use zfs diff"); break; case ZFS_DELEG_NOTE_HOLD: str = gettext("Allows adding a user hold to a snapshot"); break; case ZFS_DELEG_NOTE_MOUNT: str = gettext("Allows mount/umount of ZFS datasets"); break; case ZFS_DELEG_NOTE_PROMOTE: str = gettext("Must also have the 'mount'\n\t\t\t\tand" " 'promote' ability in the origin file system"); break; case ZFS_DELEG_NOTE_RECEIVE: str = gettext("Must also have the 'mount' and 'create'" " ability"); break; case ZFS_DELEG_NOTE_RELEASE: str = gettext("Allows releasing a user hold which\n\t\t\t\t" "might destroy the snapshot"); break; case ZFS_DELEG_NOTE_RENAME: str = gettext("Must also have the 'mount' and 'create'" "\n\t\t\t\tability in the new parent"); break; case ZFS_DELEG_NOTE_ROLLBACK: str = gettext(""); break; case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); break; case ZFS_DELEG_NOTE_SNAPSHOT: str = gettext(""); break; case ZFS_DELEG_NOTE_LOAD_KEY: str = gettext("Allows loading or unloading an encryption key"); break; case ZFS_DELEG_NOTE_CHANGE_KEY: str = gettext("Allows changing or adding an encryption key"); break; /* * case ZFS_DELEG_NOTE_VSCAN: * str = gettext(""); * break; */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: str = gettext("Allows accessing any groupquota@... property"); break; case ZFS_DELEG_NOTE_GROUPUSED: str = gettext("Allows reading any groupused@... property"); break; case ZFS_DELEG_NOTE_USERPROP: str = gettext("Allows changing any user property"); break; case ZFS_DELEG_NOTE_USERQUOTA: str = gettext("Allows accessing any userquota@... property"); break; case ZFS_DELEG_NOTE_USERUSED: str = gettext("Allows reading any userused@... property"); break; case ZFS_DELEG_NOTE_USEROBJQUOTA: str = gettext("Allows accessing any userobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "groupobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJUSED: str = gettext("Allows reading any groupobjused@... property"); break; case ZFS_DELEG_NOTE_USEROBJUSED: str = gettext("Allows reading any userobjused@... property"); break; case ZFS_DELEG_NOTE_PROJECTQUOTA: str = gettext("Allows accessing any projectquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTUSED: str = gettext("Allows reading any projectused@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJUSED: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjused@... property"); break; /* other */ default: str = ""; } return (str); } struct allow_opts { boolean_t local; boolean_t descend; boolean_t user; boolean_t group; boolean_t everyone; boolean_t create; boolean_t set; boolean_t recursive; /* unallow only */ boolean_t prt_usage; boolean_t prt_perms; char *who; char *perms; const char *dataset; }; static inline int prop_cmp(const void *a, const void *b) { const char *str1 = *(const char **)a; const char *str2 = *(const char **)b; return (strcmp(str1, str2)); } static void allow_usage(boolean_t un, boolean_t requested, const char *msg) { const char *opt_desc[] = { "-h", gettext("show this help message and exit"), "-l", gettext("set permission locally"), "-d", gettext("set permission for descents"), "-u", gettext("set permission for user"), "-g", gettext("set permission for group"), "-e", gettext("set permission for everyone"), "-c", gettext("set create time permission"), "-s", gettext("define permission set"), /* unallow only */ "-r", gettext("remove permissions recursively"), }; size_t unallow_size = sizeof (opt_desc) / sizeof (char *); size_t allow_size = unallow_size - 2; const char *props[ZFS_NUM_PROPS]; int i; size_t count = 0; FILE *fp = requested ? stdout : stderr; zprop_desc_t *pdtbl = zfs_prop_get_table(); const char *fmt = gettext("%-16s %-14s\t%s\n"); (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : HELP_ALLOW)); (void) fprintf(fp, gettext("Options:\n")); for (i = 0; i < (un ? unallow_size : allow_size); i += 2) { const char *opt = opt_desc[i]; const char *optdsc = opt_desc[i + 1]; (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); } (void) fprintf(fp, gettext("\nThe following permissions are " "supported:\n\n")); (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), gettext("NOTES")); for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; const char *perm_type = deleg_perm_type(perm_note); const char *perm_comment = deleg_perm_comment(perm_note); (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); } for (i = 0; i < ZFS_NUM_PROPS; i++) { zprop_desc_t *pd = &pdtbl[i]; if (pd->pd_visible != B_TRUE) continue; if (pd->pd_attr == PROP_READONLY) continue; props[count++] = pd->pd_name; } props[count] = NULL; qsort(props, count, sizeof (char *), prop_cmp); for (i = 0; i < count; i++) (void) fprintf(fp, fmt, props[i], gettext("property"), ""); if (msg != NULL) (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); exit(requested ? 0 : 2); } static inline const char * munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, char **permsp) { if (un && argc == expected_argc - 1) *permsp = NULL; else if (argc == expected_argc) *permsp = argv[argc - 2]; else allow_usage(un, B_FALSE, gettext("wrong number of parameters\n")); return (argv[argc - 1]); } static void parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) { int uge_sum = opts->user + opts->group + opts->everyone; int csuge_sum = opts->create + opts->set + uge_sum; int ldcsuge_sum = csuge_sum + opts->local + opts->descend; int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum; if (uge_sum > 1) allow_usage(un, B_FALSE, gettext("-u, -g, and -e are mutually exclusive\n")); if (opts->prt_usage) { if (argc == 0 && all_sum == 0) allow_usage(un, B_TRUE, NULL); else usage(B_FALSE); } if (opts->set) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -s\n")); opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); if (argv[0][0] != '@') allow_usage(un, B_FALSE, gettext("invalid set name: missing '@' prefix\n")); opts->who = argv[0]; } else if (opts->create) { if (ldcsuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -c\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (opts->everyone) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -e\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") == 0) { opts->everyone = B_TRUE; argc--; argv++; opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (argc == 1 && !un) { opts->prt_perms = B_TRUE; opts->dataset = argv[argc-1]; } else { opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); opts->who = argv[0]; } if (!opts->local && !opts->descend) { opts->local = B_TRUE; opts->descend = B_TRUE; } } static void store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, const char *who, char *perms, nvlist_t *top_nvl) { int i; char ld[2] = { '\0', '\0' }; char who_buf[MAXNAMELEN + 32]; char base_type = '\0'; char set_type = '\0'; nvlist_t *base_nvl = NULL; nvlist_t *set_nvl = NULL; nvlist_t *nvl; if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); switch (type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: set_type = ZFS_DELEG_NAMED_SET_SETS; base_type = ZFS_DELEG_NAMED_SET; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: set_type = ZFS_DELEG_CREATE_SETS; base_type = ZFS_DELEG_CREATE; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: set_type = ZFS_DELEG_USER_SETS; base_type = ZFS_DELEG_USER; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: set_type = ZFS_DELEG_GROUP_SETS; base_type = ZFS_DELEG_GROUP; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: set_type = ZFS_DELEG_EVERYONE_SETS; base_type = ZFS_DELEG_EVERYONE; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; default: assert(set_type != '\0' && base_type != '\0'); } if (perms != NULL) { char *curr = perms; char *end = curr + strlen(perms); while (curr < end) { char *delim = strchr(curr, ','); if (delim == NULL) delim = end; else *delim = '\0'; if (curr[0] == '@') nvl = set_nvl; else nvl = base_nvl; (void) nvlist_add_boolean(nvl, curr); if (delim != end) *delim = ','; curr = delim + 1; } for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (!nvlist_empty(base_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, base_nvl); } if (!nvlist_empty(set_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, set_nvl); } } } else { for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); } } } static int construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) { if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) nomem(); if (opts->set) { store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, opts->descend, opts->who, opts->perms, *nvlp); } else if (opts->create) { store_allow_perm(ZFS_DELEG_CREATE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else if (opts->everyone) { store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else { char *curr = opts->who; char *end = curr + strlen(curr); while (curr < end) { const char *who; zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; char *endch; char *delim = strchr(curr, ','); char errbuf[256]; char id[64]; struct passwd *p = NULL; struct group *g = NULL; uid_t rid; if (delim == NULL) delim = end; else *delim = '\0'; rid = (uid_t)strtol(curr, &endch, 0); if (opts->user) { who_type = ZFS_DELEG_USER; if (*endch != '\0') p = getpwnam(curr); else p = getpwuid(rid); if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { who_type = ZFS_DELEG_GROUP; if (*endch != '\0') g = getgrnam(curr); else g = getgrgid(rid); if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else { if (*endch != '\0') { p = getpwnam(curr); } else { p = getpwuid(rid); } if (p == NULL) { if (*endch != '\0') { g = getgrnam(curr); } else { g = getgrgid(rid); } } if (p != NULL) { who_type = ZFS_DELEG_USER; rid = p->pw_uid; } else if (g != NULL) { who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { (void) snprintf(errbuf, 256, gettext( "invalid user/group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } (void) sprintf(id, "%u", rid); who = id; store_allow_perm(who_type, opts->local, opts->descend, who, opts->perms, *nvlp); curr = delim + 1; } } return (0); } static void print_set_creat_perms(uu_avl_t *who_avl) { const char *sc_title[] = { gettext("Permission sets:\n"), gettext("Create time permissions:\n"), NULL }; who_perm_node_t *who_node = NULL; int prev_weight = -1; for (who_node = uu_avl_first(who_avl); who_node != NULL; who_node = uu_avl_next(who_avl, who_node)) { uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; const char *who_name = who_node->who_perm.who_name; int weight = who_type2weight(who_type); boolean_t first = B_TRUE; deleg_perm_node_t *deleg_node; if (prev_weight != weight) { (void) printf("%s", sc_title[weight]); prev_weight = weight; } if (who_name == NULL || strnlen(who_name, 1) == 0) (void) printf("\t"); else (void) printf("\t%s ", who_name); for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (first) { (void) printf("%s", deleg_node->dpn_perm.dp_name); first = B_FALSE; } else (void) printf(",%s", deleg_node->dpn_perm.dp_name); } (void) printf("\n"); } } static void print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, const char *title) { who_perm_node_t *who_node = NULL; boolean_t prt_title = B_TRUE; uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((who_node = uu_avl_walk_next(walk)) != NULL) { const char *who_name = who_node->who_perm.who_name; const char *nice_who_name = who_node->who_perm.who_ug_name; uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; char delim = ' '; deleg_perm_node_t *deleg_node; boolean_t prt_who = B_TRUE; for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (local != deleg_node->dpn_perm.dp_local || descend != deleg_node->dpn_perm.dp_descend) continue; if (prt_who) { const char *who = NULL; if (prt_title) { prt_title = B_FALSE; (void) printf("%s", title); } switch (who_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: who = gettext("user"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: who = gettext("group"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: who = gettext("everyone"); who_name = NULL; break; default: assert(who != NULL); } prt_who = B_FALSE; if (who_name == NULL) (void) printf("\t%s", who); else (void) printf("\t%s %s", who, who_name); } (void) printf("%c%s", delim, deleg_node->dpn_perm.dp_name); delim = ','; } if (!prt_who) (void) printf("\n"); } uu_avl_walk_end(walk); } static void print_fs_perms(fs_perm_set_t *fspset) { fs_perm_node_t *node = NULL; char buf[MAXNAMELEN + 32]; const char *dsname = buf; for (node = uu_list_first(fspset->fsps_list); node != NULL; node = uu_list_next(fspset->fsps_list, node)) { uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; int left = 0; (void) snprintf(buf, sizeof (buf), gettext("---- Permissions on %s "), node->fspn_fsperm.fsp_name); (void) printf("%s", dsname); left = 70 - strlen(buf); while (left-- > 0) (void) printf("-"); (void) printf("\n"); print_set_creat_perms(sc_avl); print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, gettext("Local permissions:\n")); print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, gettext("Descendent permissions:\n")); print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, gettext("Local+Descendent permissions:\n")); } } static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; struct deleg_perms { boolean_t un; nvlist_t *nvl; }; static int set_deleg_perms(zfs_handle_t *zhp, void *data) { struct deleg_perms *perms = (struct deleg_perms *)data; zfs_type_t zfs_type = zfs_get_type(zhp); if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) return (0); return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); } static int zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) { zfs_handle_t *zhp; nvlist_t *perm_nvl = NULL; nvlist_t *update_perm_nvl = NULL; int error = 1; int c; struct allow_opts opts = { 0 }; const char *optstr = un ? "ldugecsrh" : "ldugecsh"; /* check opts */ while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'l': opts.local = B_TRUE; break; case 'd': opts.descend = B_TRUE; break; case 'u': opts.user = B_TRUE; break; case 'g': opts.group = B_TRUE; break; case 'e': opts.everyone = B_TRUE; break; case 's': opts.set = B_TRUE; break; case 'c': opts.create = B_TRUE; break; case 'r': opts.recursive = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'h': opts.prt_usage = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ parse_allow_args(argc, argv, un, &opts); /* try to open the dataset */ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { (void) fprintf(stderr, "Failed to open dataset: %s\n", opts.dataset); return (-1); } if (zfs_get_fsacl(zhp, &perm_nvl) != 0) goto cleanup2; fs_perm_set_init(&fs_perm_set); if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); goto cleanup1; } if (opts.prt_perms) print_fs_perms(&fs_perm_set); else { (void) construct_fsacl_list(un, &opts, &update_perm_nvl); if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) goto cleanup0; if (un && opts.recursive) { struct deleg_perms data = { un, update_perm_nvl }; if (zfs_iter_filesystems(zhp, set_deleg_perms, &data) != 0) goto cleanup0; } } error = 0; cleanup0: nvlist_free(perm_nvl); nvlist_free(update_perm_nvl); cleanup1: fs_perm_set_fini(&fs_perm_set); cleanup2: zfs_close(zhp); return (error); } static int zfs_do_allow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); } static int zfs_do_unallow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); } static int zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { int errors = 0; int i; const char *tag; boolean_t recursive = B_FALSE; const char *opts = holding ? "rt" : "r"; int c; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 2) usage(B_FALSE); tag = argv[0]; --argc; ++argv; if (holding && tag[0] == '.') { /* tags starting with '.' are reserved for libzfs */ (void) fprintf(stderr, gettext("tag may not start with '.'\n")); usage(B_FALSE); } for (i = 0; i < argc; ++i) { zfs_handle_t *zhp; char parent[ZFS_MAX_DATASET_NAME_LEN]; const char *delim; char *path = argv[i]; delim = strchr(path, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), path); ++errors; continue; } (void) strncpy(parent, path, delim - path); parent[delim - path] = '\0'; zhp = zfs_open(g_zfs, parent, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { ++errors; continue; } if (holding) { if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) ++errors; } else { if (zfs_release(zhp, delim+1, tag, recursive) != 0) ++errors; } zfs_close(zhp); } return (errors != 0); } /* * zfs hold [-r] [-t] ... * * -r Recursively hold * * Apply a user-hold with the given tag to the list of snapshots. */ static int zfs_do_hold(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } /* * zfs release [-r] ... * * -r Recursively release * * Release a user-hold with the given tag from the list of snapshots. */ static int zfs_do_release(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct holds_cbdata { boolean_t cb_recursive; const char *cb_snapname; nvlist_t **cb_nvlp; size_t cb_max_namelen; size_t cb_max_taglen; } holds_cbdata_t; #define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" #define DATETIME_BUF_LEN (32) /* * */ static void print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl) { int i; nvpair_t *nvp = NULL; char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; const char *col; if (!scripted) { for (i = 0; i < 3; i++) { col = gettext(hdr_cols[i]); if (i < 2) (void) printf("%-*s ", i ? tagwidth : nwidth, col); else (void) printf("%s\n", col); } } while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { char *zname = nvpair_name(nvp); nvlist_t *nvl2; nvpair_t *nvp2 = NULL; (void) nvpair_value_nvlist(nvp, &nvl2); while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { char tsbuf[DATETIME_BUF_LEN]; char *tagname = nvpair_name(nvp2); uint64_t val = 0; time_t time; struct tm t; (void) nvpair_value_uint64(nvp2, &val); time = (time_t)val; (void) localtime_r(&time, &t); (void) strftime(tsbuf, DATETIME_BUF_LEN, gettext(STRFTIME_FMT_STR), &t); if (scripted) { (void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf); } else { (void) printf("%-*s %-*s %s\n", nwidth, zname, tagwidth, tagname, tsbuf); } } } } /* * Generic callback function to list a dataset or snapshot. */ static int holds_callback(zfs_handle_t *zhp, void *data) { holds_cbdata_t *cbp = data; nvlist_t *top_nvl = *cbp->cb_nvlp; nvlist_t *nvl = NULL; nvpair_t *nvp = NULL; const char *zname = zfs_get_name(zhp); size_t znamelen = strlen(zname); if (cbp->cb_recursive) { const char *snapname; char *delim = strchr(zname, '@'); if (delim == NULL) return (0); snapname = delim + 1; if (strcmp(cbp->cb_snapname, snapname)) return (0); } if (zfs_get_holds(zhp, &nvl) != 0) return (-1); if (znamelen > cbp->cb_max_namelen) cbp->cb_max_namelen = znamelen; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *tag = nvpair_name(nvp); size_t taglen = strlen(tag); if (taglen > cbp->cb_max_taglen) cbp->cb_max_taglen = taglen; } return (nvlist_add_nvlist(top_nvl, zname, nvl)); } /* * zfs holds [-rH] ... * * -r Lists holds that are set on the named snapshots recursively. * -H Scripted mode; elide headers and separate columns by tabs. */ static int zfs_do_holds(int argc, char **argv) { int errors = 0; int c; int i; boolean_t scripted = B_FALSE; boolean_t recursive = B_FALSE; const char *opts = "rH"; nvlist_t *nvl; int types = ZFS_TYPE_SNAPSHOT; holds_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; int flags = 0; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case 'H': scripted = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (recursive) { types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; flags |= ZFS_ITER_RECURSE; } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) usage(B_FALSE); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 0; i < argc; ++i) { char *snapshot = argv[i]; const char *delim; const char *snapname; delim = strchr(snapshot, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), snapshot); ++errors; continue; } snapname = delim + 1; if (recursive) snapshot[delim - snapshot] = '\0'; cb.cb_recursive = recursive; cb.cb_snapname = snapname; cb.cb_nvlp = &nvl; /* * 1. collect holds data, set format options */ ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, holds_callback, &cb); if (ret != 0) ++errors; } /* * 2. print holds data */ print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); if (nvlist_empty(nvl)) (void) fprintf(stderr, gettext("no datasets available\n")); nvlist_free(nvl); return (0 != errors); } #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; static int get_one_dataset(zfs_handle_t *zhp, void *data) { static char *spin[] = { "-", "\\", "|", "/" }; static int spinval = 0; static int spincheck = 0; static time_t last_spin_time = (time_t)0; get_all_state_t *state = data; zfs_type_t type = zfs_get_type(zhp); if (state->ga_verbose) { if (--spincheck < 0) { time_t now = time(NULL); if (last_spin_time + SPINNER_TIME < now) { update_progress(spin[spinval++ % 4]); last_spin_time = now; } spincheck = CHECK_SPINNER; } } /* * Iterate over any nested datasets. */ if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } /* * Skip any datasets whose type does not match. */ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } libzfs_add_handle(state->ga_cbp, zhp); assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); return (0); } static void get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) { get_all_state_t state = { .ga_verbose = verbose, .ga_cbp = cbp }; if (verbose) set_progress_header(gettext("Reading ZFS config")); (void) zfs_iter_root(g_zfs, get_one_dataset, &state); if (verbose) finish_progress(gettext("done.")); } /* * Generic callback for sharing or mounting filesystems. Because the code is so * similar, we have a common function with an extra parameter to determine which * mode we are using. */ typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; typedef struct share_mount_state { share_mount_op_t sm_op; boolean_t sm_verbose; int sm_flags; char *sm_options; char *sm_proto; /* only valid for OP_SHARE */ pthread_mutex_t sm_lock; /* protects the remaining fields */ uint_t sm_total; /* number of filesystems to process */ uint_t sm_done; /* number of filesystems processed */ int sm_status; /* -1 if any of the share/mount operations failed */ } share_mount_state_t; /* * Share or mount a dataset. */ static int share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, boolean_t explicit, const char *options) { char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char smbshareopts[ZFS_MAXPROPLEN]; const char *cmdname = op == OP_SHARE ? "share" : "mount"; struct mnttab mnt; uint64_t zoned, canmount; boolean_t shared_nfs, shared_smb; assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); /* * Check to make sure we can mount/share this dataset. If we * are in the global zone and the filesystem is exported to a * local zone, or if we are in a local zone and the * filesystem is not exported, then it is an error. */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned && getzoneid() == GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "dataset is exported to a local zone\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "permission denied\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * Ignore any filesystems which don't apply to us. This * includes those with a legacy mountpoint, or those with * legacy share options. */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use share(1M) to " "share this filesystem, or set " "sharenfs property on\n")); return (1); } /* * We cannot share or mount legacy filesystems. If the * shareopts is non-legacy but the mountpoint is legacy, we * treat it as a legacy share. */ if (strcmp(mountpoint, "legacy") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use %s(1M) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } if (strcmp(mountpoint, "none") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': no " "mountpoint set\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * canmount explicit outcome * on no pass through * on yes pass through * off no return 0 * off yes display error, return 1 * noauto no return 0 * noauto yes pass through */ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "'canmount' property is set to 'off'\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { /* * When performing a 'zfs mount -a', we skip any mounts for * datasets that have 'noauto' set. Sharing a dataset with * 'noauto' set is only allowed if it's mounted. */ if (op == OP_MOUNT) return (0); if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { /* also purge it from existing exports */ zfs_unshareall_bypath(zhp, mountpoint); return (0); } } /* * If this filesystem is encrypted and does not have * a loaded key, we can not mount it. */ if ((flags & MS_CRYPT) == 0 && zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "encryption key not loaded\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * If this filesystem is inconsistent and has a receive resume * token, we can not mount it. */ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Contains partially-completed state from " "\"zfs receive -s\", which can be resumed with " "\"zfs send -t\"\n"), cmdname, zfs_get_name(zhp)); return (1); } if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Dataset is not complete, was created by receiving " "a redacted zfs send stream.\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the * filesystem is already mounted or shared, return (failing * for explicit requests); otherwise mount or share the * filesystem. */ switch (op) { case OP_SHARE: shared_nfs = zfs_is_shared_nfs(zhp, NULL); shared_smb = zfs_is_shared_smb(zhp, NULL); if ((shared_nfs && shared_smb) || (shared_nfs && strcmp(shareopts, "on") == 0 && strcmp(smbshareopts, "off") == 0) || (shared_smb && strcmp(smbshareopts, "on") == 0 && strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share " "'%s': filesystem already shared\n"), zfs_get_name(zhp)); return (1); } if (!zfs_is_mounted(zhp, NULL) && zfs_mount(zhp, NULL, flags) != 0) return (1); if (protocol == NULL) { if (zfs_shareall(zhp) != 0) return (1); } else if (strcmp(protocol, "nfs") == 0) { if (zfs_share_nfs(zhp)) return (1); } else if (strcmp(protocol, "smb") == 0) { if (zfs_share_smb(zhp)) return (1); } else { (void) fprintf(stderr, gettext("cannot share " "'%s': invalid share type '%s' " "specified\n"), zfs_get_name(zhp), protocol); return (1); } break; case OP_MOUNT: if (options == NULL) mnt.mnt_mntopts = ""; else mnt.mnt_mntopts = (char *)options; if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && zfs_is_mounted(zhp, NULL)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot mount " "'%s': filesystem already mounted\n"), zfs_get_name(zhp)); return (1); } if (zfs_mount(zhp, options, flags) != 0) return (1); break; } return (0); } /* * Reports progress in the form "(current/total)". Not thread-safe. */ static void report_mount_progress(int current, int total) { static time_t last_progress_time = 0; time_t now = time(NULL); char info[32]; /* report 1..n instead of 0..n-1 */ ++current; /* display header if we're here for the first time */ if (current == 1) { set_progress_header(gettext("Mounting ZFS filesystems")); } else if (current != total && last_progress_time + MOUNT_TIME >= now) { /* too soon to report again */ return; } last_progress_time = now; (void) sprintf(info, "(%d/%d)", current, total); if (current == total) finish_progress(info); else update_progress(info); } /* * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and * updates the progress meter. */ static int share_mount_one_cb(zfs_handle_t *zhp, void *arg) { share_mount_state_t *sms = arg; int ret; ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, B_FALSE, sms->sm_options); pthread_mutex_lock(&sms->sm_lock); if (ret != 0) sms->sm_status = ret; sms->sm_done++; if (sms->sm_verbose) report_mount_progress(sms->sm_done, sms->sm_total); pthread_mutex_unlock(&sms->sm_lock); return (ret); } static void append_options(char *mntopts, char *newopts) { int len = strlen(mntopts); /* original length plus new string to append plus 1 for the comma */ if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { (void) fprintf(stderr, gettext("the opts argument for " "'%s' option is too long (more than %d chars)\n"), "-o", MNT_LINE_MAX); usage(B_FALSE); } if (*mntopts) mntopts[len++] = ','; (void) strcpy(&mntopts[len], newopts); } static int share_mount(int op, int argc, char **argv) { int do_all = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'v': verbose = B_TRUE; break; case 'l': flags |= MS_CRYPT; break; case 'o': if (*optarg == '\0') { (void) fprintf(stderr, gettext("empty mount " "options (-o) specified\n")); usage(B_FALSE); } if (options == NULL) options = safe_malloc(MNT_LINE_MAX + 1); /* option validation is done later */ append_options(options, optarg); break; case 'O': flags |= MS_OVERLAY; break; case 'f': flags |= MS_FORCE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (do_all) { char *protocol = NULL; if (op == OP_SHARE && argc > 0) { if (strcmp(argv[0], "nfs") != 0 && strcmp(argv[0], "smb") != 0) { (void) fprintf(stderr, gettext("share type " "must be 'nfs' or 'smb'\n")); usage(B_FALSE); } protocol = argv[0]; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } start_progress_timer(); get_all_cb_t cb = { 0 }; get_all_datasets(&cb, verbose); if (cb.cb_used == 0) { if (options != NULL) free(options); return (0); } share_mount_state_t share_mount_state = { 0 }; share_mount_state.sm_op = op; share_mount_state.sm_verbose = verbose; share_mount_state.sm_flags = flags; share_mount_state.sm_options = options; share_mount_state.sm_proto = protocol; share_mount_state.sm_total = cb.cb_used; pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* * libshare isn't mt-safe, so only do the operation in parallel * if we're mounting. Additionally, the key-loading option must * be serialized so that we can prompt the user for their keys * in a consistent manner. */ zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, share_mount_one_cb, &share_mount_state, op == OP_MOUNT && !(flags & MS_CRYPT)); zfs_commit_all_shares(); ret = share_mount_state.sm_status; for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); } else if (argc == 0) { struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { (void) fprintf(stderr, gettext("missing filesystem " "argument (specify -a for all)\n")); usage(B_FALSE); } /* * When mount is given no arguments, go through * /proc/self/mounts and display any active ZFS mounts. * We hide any snapshots, since they are controlled * automatically. */ /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", mnttab_file) == NULL) { if (options != NULL) free(options); return (ENOENT); } while (getmntent(mnttab_file, &entry) == 0) { if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || strchr(entry.mnt_special, '@') != NULL) continue; (void) printf("%-30s %s\n", entry.mnt_special, entry.mnt_mountp); } } else { zfs_handle_t *zhp; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; } else { ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, options); zfs_commit_all_shares(); zfs_close(zhp); } } if (options != NULL) free(options); return (ret); } /* * zfs mount -a [nfs] * zfs mount filesystem * * Mount all filesystems, or mount the given filesystem. */ static int zfs_do_mount(int argc, char **argv) { return (share_mount(OP_MOUNT, argc, argv)); } /* * zfs share -a [nfs | smb] * zfs share filesystem * * Share all filesystems, or share the given filesystem. */ static int zfs_do_share(int argc, char **argv) { return (share_mount(OP_SHARE, argc, argv)); } typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; uu_avl_node_t un_avlnode; } unshare_unmount_node_t; /* ARGSUSED */ static int unshare_unmount_compare(const void *larg, const void *rarg, void *unused) { const unshare_unmount_node_t *l = larg; const unshare_unmount_node_t *r = rarg; return (strcmp(l->un_mountp, r->un_mountp)); } /* * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an * absolute path, find the entry /proc/self/mounts, verify that it's a * ZFS filesystem, and unmount it appropriately. */ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; int ret = 0; struct stat64 statbuf; struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; /* * Search for the given (major,minor) pair in the mount table. */ /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", mnttab_file) == NULL) return (ENOENT); if (getextmntent(path, &entry, &statbuf) != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); return (1); } (void) fprintf(stderr, gettext("warning: %s not in" "/proc/self/mounts\n"), path); if ((ret = umount2(path, flags)) != 0) (void) fprintf(stderr, gettext("%s: %s\n"), path, strerror(errno)); return (ret != 0); } path_inode = statbuf.st_ino; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " "filesystem\n"), cmdname, path); return (1); } if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); ret = 1; if (stat64(entry.mnt_mountp, &statbuf) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), cmdname, path, strerror(errno)); goto out; } else if (statbuf.st_ino != path_inode) { (void) fprintf(stderr, gettext("cannot " "%s '%s': not a mountpoint\n"), cmdname, path); goto out; } if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(smbshare_prop, "off") == 0) { (void) fprintf(stderr, gettext("cannot unshare " "'%s': legacy share\n"), path); (void) fprintf(stderr, gettext("use exportfs(8) " "or smbcontrol(1) to unshare this filesystem\n")); } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot unshare '%s': " "not currently shared\n"), path); } else { ret = zfs_unshareall_bypath(zhp, path); zfs_commit_all_shares(); } } else { char mtpt_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); if (is_manual) { ret = zfs_unmount(zhp, NULL, flags); } else if (strcmp(mtpt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot unmount " "'%s': legacy mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use umount(8) " "to unmount this filesystem\n")); } else { ret = zfs_unmountall(zhp, flags); } } out: zfs_close(zhp); return (ret != 0); } /* * Generic callback for unsharing or unmounting a filesystem. */ static int unshare_unmount(int op, int argc, char **argv) { int do_all = 0; int flags = 0; int ret = 0; int c; zfs_handle_t *zhp; char nfs_mnt_prop[ZFS_MAXPROPLEN]; char sharesmb[ZFS_MAXPROPLEN]; /* check options */ while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'f': flags |= MS_FORCE; break; case 'u': flags |= MS_CRYPT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (do_all) { /* * We could make use of zfs_for_each() to walk all datasets in * the system, but this would be very inefficient, especially * since we would have to linearly search /proc/self/mounts for * each one. Instead, do one pass through /proc/self/mounts * looking for zfs entries and call zfs_unmount() for each one. * * Things get a little tricky if the administrator has created * mountpoints beneath other ZFS filesystems. In this case, we * have to unmount the deepest filesystems first. To accomplish * this, we place all the mountpoints in an AVL tree sorted by * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. */ struct mnttab entry; uu_avl_pool_t *pool; uu_avl_t *tree = NULL; unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; char *protocol = NULL; if (op == OP_SHARE && argc > 0) { if (strcmp(argv[0], "nfs") != 0 && strcmp(argv[0], "smb") != 0) { (void) fprintf(stderr, gettext("share type " "must be 'nfs' or 'smb'\n")); usage(B_FALSE); } protocol = argv[0]; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (((pool = uu_avl_pool_create("unmount_pool", sizeof (unshare_unmount_node_t), offsetof(unshare_unmount_node_t, un_avlnode), unshare_unmount_compare, UU_DEFAULT)) == NULL) || ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) nomem(); /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", mnttab_file) == NULL) return (ENOENT); while (getmntent(mnttab_file, &entry) == 0) { /* ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* ignore snapshots */ if (strchr(entry.mnt_special, '@') != NULL) continue; if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; continue; } /* * Ignore datasets that are excluded/restricted by * parent pool name. */ if (zpool_skip_pool(zfs_get_pool_name(zhp))) { zfs_close(zhp); continue; } switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") != 0) break; verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0) continue; break; case OP_MOUNT: /* Ignore legacy mounts */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "legacy") == 0) continue; /* Ignore canmount=noauto mounts */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) continue; default: break; } node = safe_malloc(sizeof (unshare_unmount_node_t)); node->un_zhp = zhp; node->un_mountp = safe_strdup(entry.mnt_mountp); uu_avl_node_init(node, &node->un_avlnode, pool); if (uu_avl_find(tree, node, NULL, &idx) == NULL) { uu_avl_insert(tree, node, idx); } else { zfs_close(node->un_zhp); free(node->un_mountp); free(node); } } /* * Walk the AVL tree in reverse, unmounting each filesystem and * removing it from the AVL tree in the process. */ if ((walk = uu_avl_walk_start(tree, UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { const char *mntarg = NULL; uu_avl_remove(tree, node); switch (op) { case OP_SHARE: if (zfs_unshareall_bytype(node->un_zhp, node->un_mountp, protocol) != 0) ret = 1; break; case OP_MOUNT: if (zfs_unmount(node->un_zhp, mntarg, flags) != 0) ret = 1; break; } zfs_close(node->un_zhp); free(node->un_mountp); free(node); } if (op == OP_SHARE) zfs_commit_shares(protocol); uu_avl_walk_end(walk); uu_avl_destroy(tree); uu_avl_pool_destroy(pool); } else { if (argc != 1) { if (argc == 0) (void) fprintf(stderr, gettext("missing filesystem argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * We have an argument, but it may be a full path or a ZFS * filesystem. Pass full paths off to unmount_path() (shared by * manual_unmount), otherwise open the filesystem and pass to * zfs_unmount(). */ if (argv[0][0] == '/') return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) return (1); verify(zfs_prop_get(zhp, op == OP_SHARE ? ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, sharesmb, sizeof (sharesmb), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(sharesmb, "off") == 0) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "unshare(1M) to unshare this " "filesystem\n")); ret = 1; } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': not currently " "shared\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unshareall(zhp) != 0) { ret = 1; } break; case OP_MOUNT: if (strcmp(nfs_mnt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "umount(1M) to unmount this " "filesystem\n")); ret = 1; } else if (!zfs_is_mounted(zhp, NULL)) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': not currently " "mounted\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unmountall(zhp, flags) != 0) { ret = 1; } break; } zfs_close(zhp); } return (ret); } /* * zfs unmount [-fu] -a * zfs unmount [-fu] filesystem * * Unmount all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unmount(int argc, char **argv) { return (unshare_unmount(OP_MOUNT, argc, argv)); } /* * zfs unshare -a * zfs unshare filesystem * * Unshare all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unshare(int argc, char **argv) { return (unshare_unmount(OP_SHARE, argc, argv)); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } static int zfs_do_diff(int argc, char **argv) { zfs_handle_t *zhp; int flags = 0; char *tosnap = NULL; char *fromsnap = NULL; char *atp, *copy; int err = 0; int c; struct sigaction sa; while ((c = getopt(argc, argv, "FHt")) != -1) { switch (c) { case 'F': flags |= ZFS_DIFF_CLASSIFY; break; case 'H': flags |= ZFS_DIFF_PARSEABLE; break; case 't': flags |= ZFS_DIFF_TIMESTAMP; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("must provide at least one snapshot name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } fromsnap = argv[0]; tosnap = (argc == 2) ? argv[1] : NULL; copy = NULL; if (*fromsnap != '@') copy = strdup(fromsnap); else if (tosnap) copy = strdup(tosnap); if (copy == NULL) usage(B_FALSE); if ((atp = strchr(copy, '@')) != NULL) *atp = '\0'; if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { free(copy); return (1); } free(copy); /* * Ignore SIGPIPE so that the library can give us * information on any failure */ if (sigemptyset(&sa.sa_mask) == -1) { err = errno; goto out; } sa.sa_flags = 0; sa.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &sa, NULL) == -1) { err = errno; goto out; } err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); out: zfs_close(zhp); return (err != 0); } /* * zfs bookmark | * * Creates a bookmark with the given name from the source snapshot * or creates a copy of an existing source bookmark. */ static int zfs_do_bookmark(int argc, char **argv) { char *source, *bookname; char expbuf[ZFS_MAX_DATASET_NAME_LEN]; int source_type; nvlist_t *nvl; int ret = 0; int c; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing bookmark argument\n")); goto usage; } source = argv[0]; bookname = argv[1]; if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { (void) fprintf(stderr, gettext("invalid source name '%s': " "must contain a '@' or '#'\n"), source); goto usage; } if (strchr(bookname, '#') == NULL) { (void) fprintf(stderr, gettext("invalid bookmark name '%s': " "must contain a '#'\n"), bookname); goto usage; } /* * expand source or bookname to full path: * one of them may be specified as short name */ { char **expand; char *source_short, *bookname_short; source_short = strpbrk(source, "@#"); bookname_short = strpbrk(bookname, "#"); if (source_short == source && bookname_short == bookname) { (void) fprintf(stderr, gettext( "either source or bookmark must be specified as " "full dataset paths")); goto usage; } else if (source_short != source && bookname_short != bookname) { expand = NULL; } else if (source_short != source) { strlcpy(expbuf, source, sizeof (expbuf)); expand = &bookname; } else if (bookname_short != bookname) { strlcpy(expbuf, bookname, sizeof (expbuf)); expand = &source; } else { abort(); } if (expand != NULL) { *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ (void) strlcat(expbuf, *expand, sizeof (expbuf)); *expand = expbuf; } } /* determine source type */ switch (*strpbrk(source, "@#")) { case '@': source_type = ZFS_TYPE_SNAPSHOT; break; case '#': source_type = ZFS_TYPE_BOOKMARK; break; default: abort(); } /* test the source exists */ zfs_handle_t *zhp; zhp = zfs_open(g_zfs, source, source_type); if (zhp == NULL) goto usage; zfs_close(zhp); nvl = fnvlist_alloc(); fnvlist_add_string(nvl, bookname, source); ret = lzc_bookmark(nvl, NULL); fnvlist_free(nvl); if (ret != 0) { const char *err_msg = NULL; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create bookmark '%s'"), bookname); switch (ret) { case EXDEV: err_msg = "bookmark is in a different pool"; break; case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: err_msg = "source is not an ancestor of the " "new bookmark's dataset"; break; case EEXIST: err_msg = "bookmark exists"; break; case EINVAL: err_msg = "invalid argument"; break; case ENOTSUP: err_msg = "bookmark feature not enabled"; break; case ENOSPC: err_msg = "out of space"; break; case ENOENT: err_msg = "dataset does not exist"; break; default: (void) zfs_standard_error(g_zfs, ret, errbuf); break; } if (err_msg != NULL) { (void) fprintf(stderr, "%s: %s\n", errbuf, dgettext(TEXT_DOMAIN, err_msg)); } } return (ret != 0); usage: usage(B_FALSE); return (-1); } static int zfs_do_channel_program(int argc, char **argv) { int ret, fd, c; char *progbuf, *filename, *poolname; size_t progsize, progread; nvlist_t *outnvl = NULL; uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; /* check options */ while ((c = getopt(argc, argv, "nt:m:j")) != -1) { switch (c) { case 't': case 'm': { uint64_t arg; char *endp; errno = 0; arg = strtoull(optarg, &endp, 0); if (errno != 0 || *endp != '\0') { (void) fprintf(stderr, gettext( "invalid argument " "'%s': expected integer\n"), optarg); goto usage; } if (c == 't') { instrlimit = arg; } else { ASSERT3U(c, ==, 'm'); memlimit = arg; } break; } case 'n': { sync_flag = B_FALSE; break; } case 'j': { json_output = B_TRUE; break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, gettext("invalid number of arguments\n")); goto usage; } poolname = argv[0]; filename = argv[1]; if (strcmp(filename, "-") == 0) { fd = 0; filename = "standard input"; } else if ((fd = open(filename, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), filename, strerror(errno)); return (1); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("cannot open pool '%s'\n"), poolname); if (fd != 0) (void) close(fd); return (1); } zpool_close(zhp); /* * Read in the channel program, expanding the program buffer as * necessary. */ progread = 0; progsize = 1024; progbuf = safe_malloc(progsize); do { ret = read(fd, progbuf + progread, progsize - progread); progread += ret; if (progread == progsize && ret > 0) { progsize *= 2; progbuf = safe_realloc(progbuf, progsize); } } while (ret > 0); if (fd != 0) (void) close(fd); if (ret < 0) { free(progbuf); (void) fprintf(stderr, gettext("cannot read '%s': %s\n"), filename, strerror(errno)); return (1); } progbuf[progread] = '\0'; /* * Any remaining arguments are passed as arguments to the lua script as * a string array: * { * "argv" -> [ "arg 1", ... "arg n" ], * } */ nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2); if (sync_flag) { ret = lzc_channel_program(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } else { ret = lzc_channel_program_nosync(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } if (ret != 0) { /* * On error, report the error message handed back by lua if one * exists. Otherwise, generate an appropriate error message, * falling back on strerror() for an unexpected return code. */ char *errstring = NULL; const char *msg = gettext("Channel program execution failed"); uint64_t instructions = 0; if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { (void) nvlist_lookup_string(outnvl, ZCP_RET_ERROR, &errstring); if (errstring == NULL) errstring = strerror(ret); if (ret == ETIME) { (void) nvlist_lookup_uint64(outnvl, ZCP_ARG_INSTRLIMIT, &instructions); } } else { switch (ret) { case EINVAL: errstring = "Invalid instruction or memory limit."; break; case ENOMEM: errstring = "Return value too large."; break; case ENOSPC: errstring = "Memory limit exhausted."; break; case ETIME: errstring = "Timed out."; break; case EPERM: errstring = "Permission denied. Channel " "programs must be run as root."; break; default: (void) zfs_standard_error(g_zfs, ret, msg); } } if (errstring != NULL) (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); if (ret == ETIME && instructions != 0) (void) fprintf(stderr, gettext("%llu Lua instructions\n"), (u_longlong_t)instructions); } else { if (json_output) { (void) nvlist_print_json(stdout, outnvl); } else if (nvlist_empty(outnvl)) { (void) fprintf(stdout, gettext("Channel program fully " "executed and did not produce output.\n")); } else { (void) fprintf(stdout, gettext("Channel program fully " "executed and produced output:\n")); dump_nvlist(outnvl, 4); } } free(progbuf); fnvlist_free(outnvl); fnvlist_free(argnvl); return (ret != 0); usage: usage(B_FALSE); return (-1); } typedef struct loadkey_cbdata { boolean_t cb_loadkey; boolean_t cb_recursive; boolean_t cb_noop; char *cb_keylocation; uint64_t cb_numfailed; uint64_t cb_numattempted; } loadkey_cbdata_t; static int load_key_callback(zfs_handle_t *zhp, void *data) { int ret; boolean_t is_encroot; loadkey_cbdata_t *cb = data; uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); /* * If we are working recursively, we want to skip loading / unloading * keys for non-encryption roots and datasets whose keys are already * in the desired end-state. */ if (cb->cb_recursive) { ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); if (ret != 0) return (ret); if (!is_encroot) return (0); if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) || (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE)) return (0); } cb->cb_numattempted++; if (cb->cb_loadkey) ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation); else ret = zfs_crypto_unload_key(zhp); if (ret != 0) { cb->cb_numfailed++; return (ret); } return (0); } static int load_unload_keys(int argc, char **argv, boolean_t loadkey) { int c, ret = 0, flags = 0; boolean_t do_all = B_FALSE; loadkey_cbdata_t cb = { 0 }; cb.cb_loadkey = loadkey; while ((c = getopt(argc, argv, "anrL:")) != -1) { /* noop and alternate keylocations only apply to zfs load-key */ if (loadkey) { switch (c) { case 'n': cb.cb_noop = B_TRUE; continue; case 'L': cb.cb_keylocation = optarg; continue; default: break; } } switch (c) { case 'a': do_all = B_TRUE; cb.cb_recursive = B_TRUE; break; case 'r': flags |= ZFS_ITER_RECURSE; cb.cb_recursive = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!do_all && argc == 0) { (void) fprintf(stderr, gettext("Missing dataset argument or -a option\n")); usage(B_FALSE); } if (do_all && argc != 0) { (void) fprintf(stderr, gettext("Cannot specify dataset with -a option\n")); usage(B_FALSE); } if (cb.cb_recursive && cb.cb_keylocation != NULL && strcmp(cb.cb_keylocation, "prompt") != 0) { (void) fprintf(stderr, gettext("alternate keylocation may only " "be 'prompt' with -r or -a\n")); usage(B_FALSE); } ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0, load_key_callback, &cb); if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) { (void) printf(gettext("%llu / %llu key(s) successfully %s\n"), (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), (u_longlong_t)cb.cb_numattempted, loadkey ? (cb.cb_noop ? "verified" : "loaded") : "unloaded"); } if (cb.cb_numfailed != 0) ret = -1; return (ret); } static int zfs_do_load_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_TRUE)); } static int zfs_do_unload_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_FALSE)); } static int zfs_do_change_key(int argc, char **argv) { int c, ret; uint64_t keystatus; boolean_t loadkey = B_FALSE, inheritkey = B_FALSE; zfs_handle_t *zhp = NULL; nvlist_t *props = fnvlist_alloc(); while ((c = getopt(argc, argv, "lio:")) != -1) { switch (c) { case 'l': loadkey = B_TRUE; break; case 'i': inheritkey = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (inheritkey && !nvlist_empty(props)) { (void) fprintf(stderr, gettext("Properties not allowed for inheriting\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("Too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[argc - 1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (loadkey) { keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus != ZFS_KEYSTATUS_AVAILABLE) { ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } } /* refresh the properties so the new keystatus is visible */ zfs_refresh_properties(zhp); } ret = zfs_crypto_rewrap(zhp, props, inheritkey); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } nvlist_free(props); zfs_close(zhp); return (0); } /* * 1) zfs project [-d|-r] * List project ID and inherit flag of file(s) or directories. * -d: List the directory itself, not its children. * -r: List subdirectories recursively. * * 2) zfs project -C [-k] [-r] * Clear project inherit flag and/or ID on the file(s) or directories. * -k: Keep the project ID unchanged. If not specified, the project ID * will be reset as zero. * -r: Clear on subdirectories recursively. * * 3) zfs project -c [-0] [-d|-r] [-p id] * Check project ID and inherit flag on the file(s) or directories, * report the outliers. * -0: Print file name followed by a NUL instead of newline. * -d: Check the directory itself, not its children. * -p: Specify the referenced ID for comparing with the target file(s) * or directories' project IDs. If not specified, the target (top) * directory's project ID will be used as the referenced one. * -r: Check subdirectories recursively. * * 4) zfs project [-p id] [-r] [-s] * Set project ID and/or inherit flag on the file(s) or directories. * -p: Set the project ID as the given id. * -r: Set on subdirectories recursively. If not specify "-p" option, * it will use top-level directory's project ID as the given id, * then set both project ID and inherit flag on all descendants * of the top-level directory. * -s: Set project inherit flag. */ static int zfs_do_project(int argc, char **argv) { zfs_project_control_t zpc = { .zpc_expected_projid = ZFS_INVALID_PROJID, .zpc_op = ZFS_PROJECT_OP_DEFAULT, .zpc_dironly = B_FALSE, .zpc_keep_projid = B_FALSE, .zpc_newline = B_TRUE, .zpc_recursive = B_FALSE, .zpc_set_flag = B_FALSE, }; int ret = 0, c; if (argc < 2) usage(B_FALSE); while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) { switch (c) { case '0': zpc.zpc_newline = B_FALSE; break; case 'C': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CLEAR; break; case 'c': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CHECK; break; case 'd': zpc.zpc_dironly = B_TRUE; /* overwrite "-r" option */ zpc.zpc_recursive = B_FALSE; break; case 'k': zpc.zpc_keep_projid = B_TRUE; break; case 'p': { char *endptr; errno = 0; zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("project ID must be less than " "%u\n"), UINT32_MAX); usage(B_FALSE); } if (zpc.zpc_expected_projid >= UINT32_MAX) { (void) fprintf(stderr, gettext("invalid project ID\n")); usage(B_FALSE); } break; } case 'r': zpc.zpc_recursive = B_TRUE; /* overwrite "-d" option */ zpc.zpc_dironly = B_FALSE; break; case 's': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_set_flag = B_TRUE; zpc.zpc_op = ZFS_PROJECT_OP_SET; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) { if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) zpc.zpc_op = ZFS_PROJECT_OP_SET; else zpc.zpc_op = ZFS_PROJECT_OP_LIST; } switch (zpc.zpc_op) { case ZFS_PROJECT_OP_LIST: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CHECK: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CLEAR: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) { (void) fprintf(stderr, gettext("'-p' is useless together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_SET: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless for set project ID and/or " "inherit flag\n")); usage(B_FALSE); } if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; default: ASSERT(0); break; } argv += optind; argc -= optind; if (argc == 0) { (void) fprintf(stderr, gettext("missing file or directory target(s)\n")); usage(B_FALSE); } for (int i = 0; i < argc; i++) { int err; err = zfs_project_handle(argv[i], &zpc); if (err && !ret) ret = err; } return (ret); } static int zfs_do_wait(int argc, char **argv) { boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; int error, i; char c; /* By default, wait for all types of activity. */ for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "t:")) != -1) { switch (c) { case 't': { static char *col_subopts[] = { "deleteq", NULL }; char *value; /* Reset activities array */ bzero(&enabled, sizeof (enabled)); while (*optarg != '\0') { int activity = getsubopt(&optarg, col_subopts, &value); if (activity < 0) { (void) fprintf(stderr, gettext("invalid activity '%s'\n"), value); usage(B_FALSE); } enabled[activity] = B_TRUE; } break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argv += optind; argc -= optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'filesystem' " "argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!enabled[i]) continue; error = zfs_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zfs_close(zhp); return (error); } /* * Display version message */ static int zfs_do_version(int argc, char **argv) { if (zfs_version_print() == -1) return (1); return (0); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * The 'umount' command is an alias for 'unmount' */ if (strcmp(cmdname, "umount") == 0) cmdname = "unmount"; /* * The 'recv' command is an alias for 'receive' */ if (strcmp(cmdname, "recv") == 0) cmdname = "receive"; /* * The 'snap' command is an alias for 'snapshot' */ if (strcmp(cmdname, "snap") == 0) cmdname = "snapshot"; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || (strcmp(cmdname, "--help") == 0)) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zfs_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } mnttab_file = g_zfs->libzfs_mnttab; zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=') != NULL) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } #ifdef __FreeBSD__ #include #include /* * Attach/detach the given dataset to/from the given jail */ /* ARGSUSED */ static int zfs_do_jail_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int jailid, ret; /* check number of arguments */ if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } jailid = jail_getid(argv[1]); if (jailid < 0) { (void) fprintf(stderr, gettext("invalid jail id or name\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_jail(zhp, jailid, attach) != 0); zfs_close(zhp); return (ret); } /* * zfs jail jailid filesystem * * Attach the given dataset to the given jail */ /* ARGSUSED */ static int zfs_do_jail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_TRUE)); } /* * zfs unjail jailid filesystem * * Detach the given dataset from the given jail */ /* ARGSUSED */ static int zfs_do_unjail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_FALSE)); } #endif Index: vendor-sys/openzfs/dist/config/kernel-global_page_state.m4 =================================================================== --- vendor-sys/openzfs/dist/config/kernel-global_page_state.m4 (revision 365339) +++ vendor-sys/openzfs/dist/config/kernel-global_page_state.m4 (revision 365340) @@ -1,137 +1,128 @@ dnl # dnl # 4.8 API change dnl # dnl # 75ef71840539 mm, vmstat: add infrastructure for per-node vmstats dnl # 599d0c954f91 mm, vmscan: move LRU lists to node dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_NODE_PAGE_STATE], [ ZFS_LINUX_TEST_SRC([global_node_page_state], [ #include #include ],[ (void) global_node_page_state(0); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_NODE_PAGE_STATE], [ AC_MSG_CHECKING([whether global_node_page_state() exists]) ZFS_LINUX_TEST_RESULT([global_node_page_state], [ AC_MSG_RESULT(yes) AC_DEFINE(ZFS_GLOBAL_NODE_PAGE_STATE, 1, [global_node_page_state() exists]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 4.14 API change dnl # dnl # c41f012ade0b mm: rename global_page_state to global_zone_page_state dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_ZONE_PAGE_STATE], [ ZFS_LINUX_TEST_SRC([global_zone_page_state], [ #include #include ],[ (void) global_zone_page_state(0); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE], [ AC_MSG_CHECKING([whether global_zone_page_state() exists]) ZFS_LINUX_TEST_RESULT([global_zone_page_state], [ AC_MSG_RESULT(yes) AC_DEFINE(ZFS_GLOBAL_ZONE_PAGE_STATE, 1, [global_zone_page_state() exists]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # Create a define and autoconf variable for an enum member dnl # AC_DEFUN([ZFS_AC_KERNEL_ENUM_MEMBER], [ AC_MSG_CHECKING([whether enum $2 contains $1]) AS_IF([AC_TRY_COMMAND( "${srcdir}/scripts/enum-extract.pl" "$2" "$3" | egrep -qx $1)],[ AC_MSG_RESULT([yes]) AC_DEFINE(m4_join([_], [ZFS_ENUM], m4_toupper($2), $1), 1, [enum $2 contains $1]) m4_join([_], [ZFS_ENUM], m4_toupper($2), $1)=1 ],[ AC_MSG_RESULT([no]) ]) ]) dnl # dnl # Sanity check helpers dnl # AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_ERROR],[ AC_MSG_RESULT(no) AC_MSG_RESULT([$1 in either node_stat_item or zone_stat_item: $2]) ZFS_LINUX_TEST_ERROR([global page state]) ]) AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK], [ enum_check_a="m4_join([_], [$ZFS_ENUM_NODE_STAT_ITEM], $1)" enum_check_b="m4_join([_], [$ZFS_ENUM_ZONE_STAT_ITEM], $1)" AS_IF([test -n "$enum_check_a" -a -n "$enum_check_b"],[ ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_ERROR([$1], [DUPLICATE]) ]) AS_IF([test -z "$enum_check_a" -a -z "$enum_check_b"],[ ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_ERROR([$1], [NOT FOUND]) ]) ]) dnl # dnl # Ensure the config tests are finding one and only one of each enum. dnl # AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE_SANITY], [ AC_MSG_CHECKING([whether global_page_state enums are sane]) ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_FILE_PAGES]) ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_INACTIVE_ANON]) ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_INACTIVE_FILE]) - AS_IF([test -z "$ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B"],[ - ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK([NR_SLAB_RECLAIMABLE]) - ]) AC_MSG_RESULT(yes) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE], [ ZFS_AC_KERNEL_SRC_GLOBAL_NODE_PAGE_STATE ZFS_AC_KERNEL_SRC_GLOBAL_ZONE_PAGE_STATE ]) dnl # dnl # enum members in which we're interested dnl # AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE], [ ZFS_AC_KERNEL_GLOBAL_NODE_PAGE_STATE ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE ZFS_AC_KERNEL_ENUM_MEMBER([NR_FILE_PAGES], [node_stat_item], [$LINUX/include/linux/mmzone.h]) ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_ANON], [node_stat_item], [$LINUX/include/linux/mmzone.h]) ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_FILE], [node_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_SLAB_RECLAIMABLE], - [node_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_SLAB_RECLAIMABLE_B], - [node_stat_item], [$LINUX/include/linux/mmzone.h]) ZFS_AC_KERNEL_ENUM_MEMBER([NR_FILE_PAGES], [zone_stat_item], [$LINUX/include/linux/mmzone.h]) ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_ANON], [zone_stat_item], [$LINUX/include/linux/mmzone.h]) ZFS_AC_KERNEL_ENUM_MEMBER([NR_INACTIVE_FILE], - [zone_stat_item], [$LINUX/include/linux/mmzone.h]) - ZFS_AC_KERNEL_ENUM_MEMBER([NR_SLAB_RECLAIMABLE], [zone_stat_item], [$LINUX/include/linux/mmzone.h]) ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE_SANITY ]) Index: vendor-sys/openzfs/dist/config/zfs-build.m4 =================================================================== --- vendor-sys/openzfs/dist/config/zfs-build.m4 (revision 365339) +++ vendor-sys/openzfs/dist/config/zfs-build.m4 (revision 365340) @@ -1,554 +1,564 @@ AC_DEFUN([ZFS_AC_LICENSE], [ AC_MSG_CHECKING([zfs author]) AC_MSG_RESULT([$ZFS_META_AUTHOR]) AC_MSG_CHECKING([zfs license]) AC_MSG_RESULT([$ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_DEBUG_ENABLE], [ DEBUG_CFLAGS="-Werror" DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_with_debug" AC_DEFINE(ZFS_DEBUG, 1, [zfs debugging enabled]) KERNEL_DEBUG_CFLAGS="-Werror" KERNEL_DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" ]) AC_DEFUN([ZFS_AC_DEBUG_DISABLE], [ DEBUG_CFLAGS="" DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_without_debug" KERNEL_DEBUG_CFLAGS="" KERNEL_DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" ]) dnl # dnl # When debugging is enabled: dnl # - Enable all ASSERTs (-DDEBUG) dnl # - Promote all compiler warnings to errors (-Werror) dnl # AC_DEFUN([ZFS_AC_DEBUG], [ AC_MSG_CHECKING([whether assertion support will be enabled]) AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable compiler and code assertions @<:@default=no@:>@])], [], [enable_debug=no]) AS_CASE(["x$enable_debug"], ["xyes"], [ZFS_AC_DEBUG_ENABLE], ["xno"], [ZFS_AC_DEBUG_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debug])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUG_CPPFLAGS) AC_SUBST(DEBUG_LDFLAGS) AC_SUBST(DEBUG_ZFS) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_MSG_RESULT([$enable_debug]) ]) AC_DEFUN([ZFS_AC_DEBUGINFO_ENABLE], [ DEBUG_CFLAGS="$DEBUG_CFLAGS -g -fno-inline $NO_IPA_SRA" KERNEL_DEBUG_CFLAGS="$KERNEL_DEBUG_CFLAGS -fno-inline $NO_IPA_SRA" KERNEL_MAKE="$KERNEL_MAKE CONFIG_DEBUG_INFO=y" DEBUGINFO_ZFS="_with_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO_DISABLE], [ DEBUGINFO_ZFS="_without_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO], [ AC_MSG_CHECKING([whether debuginfo support will be forced]) AC_ARG_ENABLE([debuginfo], [AS_HELP_STRING([--enable-debuginfo], [Force generation of debuginfo @<:@default=no@:>@])], [], [enable_debuginfo=no]) AS_CASE(["x$enable_debuginfo"], ["xyes"], [ZFS_AC_DEBUGINFO_ENABLE], ["xno"], [ZFS_AC_DEBUGINFO_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debuginfo])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUGINFO_ZFS) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_MAKE) AC_MSG_RESULT([$enable_debuginfo]) ]) dnl # dnl # Disabled by default, provides basic memory tracking. Track the total dnl # number of bytes allocated with kmem_alloc() and freed with kmem_free(). dnl # Then at module unload time if any bytes were leaked it will be reported dnl # on the console. dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM], [ AC_MSG_CHECKING([whether basic kmem accounting is enabled]) AC_ARG_ENABLE([debug-kmem], [AS_HELP_STRING([--enable-debug-kmem], [Enable basic kmem accounting @<:@default=no@:>@])], [], [enable_debug_kmem=no]) AS_IF([test "x$enable_debug_kmem" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM" DEBUG_KMEM_ZFS="_with_debug_kmem" ], [ DEBUG_KMEM_ZFS="_without_debug_kmem" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_ZFS) AC_MSG_RESULT([$enable_debug_kmem]) ]) dnl # dnl # Disabled by default, provides detailed memory tracking. This feature dnl # also requires --enable-debug-kmem to be set. When enabled not only will dnl # total bytes be tracked but also the location of every kmem_alloc() and dnl # kmem_free(). When the module is unloaded a list of all leaked addresses dnl # and where they were allocated will be dumped to the console. Enabling dnl # this feature has a significant impact on performance but it makes finding dnl # memory leaks straight forward. dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM_TRACKING], [ AC_MSG_CHECKING([whether detailed kmem tracking is enabled]) AC_ARG_ENABLE([debug-kmem-tracking], [AS_HELP_STRING([--enable-debug-kmem-tracking], [Enable detailed kmem tracking @<:@default=no@:>@])], [], [enable_debug_kmem_tracking=no]) AS_IF([test "x$enable_debug_kmem_tracking" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM_TRACKING" DEBUG_KMEM_TRACKING_ZFS="_with_debug_kmem_tracking" ], [ DEBUG_KMEM_TRACKING_ZFS="_without_debug_kmem_tracking" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_TRACKING_ZFS) AC_MSG_RESULT([$enable_debug_kmem_tracking]) ]) AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_CC_NO_UNUSED_BUT_SET_VARIABLE ZFS_AC_CONFIG_ALWAYS_CC_NO_BOOL_COMPARE ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_CC_ASAN ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD ZFS_AC_CONFIG_ALWAYS_SYSTEM ZFS_AC_CONFIG_ALWAYS_ARCH ZFS_AC_CONFIG_ALWAYS_PYTHON ZFS_AC_CONFIG_ALWAYS_PYZFS ZFS_AC_CONFIG_ALWAYS_SED ]) AC_DEFUN([ZFS_AC_CONFIG], [ dnl # Remove the previous build test directory. rm -Rf build ZFS_CONFIG=all AC_ARG_WITH([config], AS_HELP_STRING([--with-config=CONFIG], [Config file 'kernel|user|all|srpm']), [ZFS_CONFIG="$withval"]) AC_ARG_ENABLE([linux-builtin], [AC_HELP_STRING([--enable-linux-builtin], [Configure for builtin in-tree kernel modules @<:@default=no@:>@])], [], [enable_linux_builtin=no]) AC_MSG_CHECKING([zfs config]) AC_MSG_RESULT([$ZFS_CONFIG]); AC_SUBST(ZFS_CONFIG) ZFS_AC_CONFIG_ALWAYS AM_COND_IF([BUILD_LINUX], [ AC_ARG_VAR([TEST_JOBS], [simultaneous jobs during configure (defaults to $(nproc))]) if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then TEST_JOBS=$(nproc) fi AC_SUBST(TEST_JOBS) ]) case "$ZFS_CONFIG" in kernel) ZFS_AC_CONFIG_KERNEL ;; user) ZFS_AC_CONFIG_USER ;; all) ZFS_AC_CONFIG_USER ZFS_AC_CONFIG_KERNEL ;; srpm) ;; *) AC_MSG_RESULT([Error!]) AC_MSG_ERROR([Bad value "$ZFS_CONFIG" for --with-config, user kernel|user|all|srpm]) ;; esac AM_CONDITIONAL([CONFIG_USER], [test "$ZFS_CONFIG" = user -o "$ZFS_CONFIG" = all]) AM_CONDITIONAL([CONFIG_KERNEL], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$enable_linux_builtin" != xyes ]) AM_CONDITIONAL([CONFIG_QAT], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) ]) dnl # dnl # Check for rpm+rpmbuild to build RPM packages. If these tools dnl # are missing it is non-fatal but you will not be able to build dnl # RPM packages and will be warned if you try too. dnl # dnl # By default the generic spec file will be used because it requires dnl # minimal dependencies. Distribution specific spec files can be dnl # placed under the 'rpm/' directory and enabled using dnl # the --with-spec= configure option. dnl # AC_DEFUN([ZFS_AC_RPM], [ RPM=rpm RPMBUILD=rpmbuild AC_MSG_CHECKING([whether $RPM is available]) AS_IF([tmp=$($RPM --version 2>/dev/null)], [ RPM_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPM=yes AC_MSG_RESULT([$HAVE_RPM ($RPM_VERSION)]) ],[ HAVE_RPM=no AC_MSG_RESULT([$HAVE_RPM]) ]) AC_MSG_CHECKING([whether $RPMBUILD is available]) AS_IF([tmp=$($RPMBUILD --version 2>/dev/null)], [ RPMBUILD_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPMBUILD=yes AC_MSG_RESULT([$HAVE_RPMBUILD ($RPMBUILD_VERSION)]) ],[ HAVE_RPMBUILD=no AC_MSG_RESULT([$HAVE_RPMBUILD]) ]) RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUGINFO_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' RPM_DEFINE_UTIL=' --define "_initconfdir $(initconfdir)"' dnl # Make the next three RPM_DEFINE_UTIL additions conditional, since dnl # their values may not be set when running: dnl # dnl # ./configure --with-config=srpm dnl # AS_IF([test -n "$dracutdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_dracutdir $(dracutdir)"' ]) AS_IF([test -n "$udevdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevdir $(udevdir)"' ]) AS_IF([test -n "$udevruledir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' ]) RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_INITRAMFS)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_VERSION)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_PKG_VERSION)' dnl # Override default lib directory on Debian/Ubuntu systems. The dnl # provided /usr/lib/rpm/platform//macros files do not dnl # specify the correct path for multiarch systems as described dnl # by the packaging guidelines. dnl # dnl # https://wiki.ubuntu.com/MultiarchSpec dnl # https://wiki.debian.org/Multiarch/Implementation dnl # AS_IF([test "$DEFAULT_PACKAGE" = "deb"], [ MULTIARCH_LIBDIR="lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)" RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_lib $(MULTIARCH_LIBDIR)"' AC_SUBST(MULTIARCH_LIBDIR) ]) dnl # Make RPM_DEFINE_KMOD additions conditional on CONFIG_KERNEL, dnl # since the values will not be set otherwise. The spec files dnl # provide defaults for them. dnl # RPM_DEFINE_KMOD='--define "_wrong_version_format_terminate_build 0"' AM_COND_IF([CONFIG_KERNEL], [ RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernels $(LINUX_VERSION)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "ksrc $(LINUX)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kobj $(LINUX_OBJ)"' ]) RPM_DEFINE_DKMS='' SRPM_DEFINE_COMMON='--define "build_src_rpm 1"' SRPM_DEFINE_UTIL= SRPM_DEFINE_KMOD= SRPM_DEFINE_DKMS= RPM_SPEC_DIR="rpm/generic" AC_ARG_WITH([spec], AS_HELP_STRING([--with-spec=SPEC], [Spec files 'generic|redhat']), [RPM_SPEC_DIR="rpm/$withval"]) AC_MSG_CHECKING([whether spec files are available]) AC_MSG_RESULT([yes ($RPM_SPEC_DIR/*.spec.in)]) AC_SUBST(HAVE_RPM) AC_SUBST(RPM) AC_SUBST(RPM_VERSION) AC_SUBST(HAVE_RPMBUILD) AC_SUBST(RPMBUILD) AC_SUBST(RPMBUILD_VERSION) AC_SUBST(RPM_SPEC_DIR) AC_SUBST(RPM_DEFINE_UTIL) AC_SUBST(RPM_DEFINE_KMOD) AC_SUBST(RPM_DEFINE_DKMS) AC_SUBST(RPM_DEFINE_COMMON) AC_SUBST(SRPM_DEFINE_UTIL) AC_SUBST(SRPM_DEFINE_KMOD) AC_SUBST(SRPM_DEFINE_DKMS) AC_SUBST(SRPM_DEFINE_COMMON) ]) dnl # dnl # Check for dpkg+dpkg-buildpackage to build DEB packages. If these dnl # tools are missing it is non-fatal but you will not be able to build dnl # DEB packages and will be warned if you try too. dnl # AC_DEFUN([ZFS_AC_DPKG], [ DPKG=dpkg DPKGBUILD=dpkg-buildpackage AC_MSG_CHECKING([whether $DPKG is available]) AS_IF([tmp=$($DPKG --version 2>/dev/null)], [ DPKG_VERSION=$(echo $tmp | $AWK '/Debian/ { print $[7] }') HAVE_DPKG=yes AC_MSG_RESULT([$HAVE_DPKG ($DPKG_VERSION)]) ],[ HAVE_DPKG=no AC_MSG_RESULT([$HAVE_DPKG]) ]) AC_MSG_CHECKING([whether $DPKGBUILD is available]) AS_IF([tmp=$($DPKGBUILD --version 2>/dev/null)], [ DPKGBUILD_VERSION=$(echo $tmp | \ $AWK '/Debian/ { print $[4] }' | cut -f-4 -d'.') HAVE_DPKGBUILD=yes AC_MSG_RESULT([$HAVE_DPKGBUILD ($DPKGBUILD_VERSION)]) ],[ HAVE_DPKGBUILD=no AC_MSG_RESULT([$HAVE_DPKGBUILD]) ]) AC_SUBST(HAVE_DPKG) AC_SUBST(DPKG) AC_SUBST(DPKG_VERSION) AC_SUBST(HAVE_DPKGBUILD) AC_SUBST(DPKGBUILD) AC_SUBST(DPKGBUILD_VERSION) ]) dnl # dnl # Until native packaging for various different packing systems dnl # can be added the least we can do is attempt to use alien to dnl # convert the RPM packages to the needed package type. This is dnl # a hack but so far it has worked reasonable well. dnl # AC_DEFUN([ZFS_AC_ALIEN], [ ALIEN=alien AC_MSG_CHECKING([whether $ALIEN is available]) AS_IF([tmp=$($ALIEN --version 2>/dev/null)], [ ALIEN_VERSION=$(echo $tmp | $AWK '{ print $[3] }') HAVE_ALIEN=yes AC_MSG_RESULT([$HAVE_ALIEN ($ALIEN_VERSION)]) ],[ HAVE_ALIEN=no AC_MSG_RESULT([$HAVE_ALIEN]) ]) AC_SUBST(HAVE_ALIEN) AC_SUBST(ALIEN) AC_SUBST(ALIEN_VERSION) ]) dnl # dnl # Using the VENDOR tag from config.guess set the default dnl # package type for 'make pkg': (rpm | deb | tgz) dnl # AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([os distribution]) - if test -f /etc/toss-release ; then - VENDOR=toss ; - elif test -f /etc/fedora-release ; then - VENDOR=fedora ; - elif test -f /etc/redhat-release ; then - VENDOR=redhat ; - elif test -f /etc/gentoo-release ; then - VENDOR=gentoo ; - elif test -f /etc/arch-release ; then - VENDOR=arch ; - elif test -f /etc/SuSE-release ; then - VENDOR=sles ; - elif test -f /etc/slackware-version ; then - VENDOR=slackware ; - elif test -f /etc/lunar.release ; then - VENDOR=lunar ; - elif test -f /etc/lsb-release ; then - VENDOR=ubuntu ; - elif test -f /etc/debian_version ; then - VENDOR=debian ; - elif test -f /etc/alpine-release ; then - VENDOR=alpine ; - elif test -f /bin/freebsd-version ; then - VENDOR=freebsd ; - else - VENDOR= ; - fi + AC_ARG_WITH([vendor], + [AS_HELP_STRING([--with-vendor], + [Distribution vendor @<:@default=check@:>@])], + [with_vendor=$withval], + [with_vendor=check]) + AS_IF([test "x$with_vendor" = "xcheck"],[ + if test -f /etc/toss-release ; then + VENDOR=toss ; + elif test -f /etc/fedora-release ; then + VENDOR=fedora ; + elif test -f /etc/redhat-release ; then + VENDOR=redhat ; + elif test -f /etc/gentoo-release ; then + VENDOR=gentoo ; + elif test -f /etc/arch-release ; then + VENDOR=arch ; + elif test -f /etc/SuSE-release ; then + VENDOR=sles ; + elif test -f /etc/slackware-version ; then + VENDOR=slackware ; + elif test -f /etc/lunar.release ; then + VENDOR=lunar ; + elif test -f /etc/lsb-release ; then + VENDOR=ubuntu ; + elif test -f /etc/debian_version ; then + VENDOR=debian ; + elif test -f /etc/alpine-release ; then + VENDOR=alpine ; + elif test -f /bin/freebsd-version ; then + VENDOR=freebsd ; + else + VENDOR= ; + fi], + [ test "x${with_vendor}" != x],[ + VENDOR="$with_vendor" ], + [ VENDOR= ; ] + ) AC_MSG_RESULT([$VENDOR]) AC_SUBST(VENDOR) AC_MSG_CHECKING([default package type]) case "$VENDOR" in toss) DEFAULT_PACKAGE=rpm ;; redhat) DEFAULT_PACKAGE=rpm ;; fedora) DEFAULT_PACKAGE=rpm ;; gentoo) DEFAULT_PACKAGE=tgz ;; alpine) DEFAULT_PACKAGE=tgz ;; arch) DEFAULT_PACKAGE=tgz ;; sles) DEFAULT_PACKAGE=rpm ;; slackware) DEFAULT_PACKAGE=tgz ;; lunar) DEFAULT_PACKAGE=tgz ;; ubuntu) DEFAULT_PACKAGE=deb ;; debian) DEFAULT_PACKAGE=deb ;; freebsd) DEFAULT_PACKAGE=pkg ;; *) DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) AC_MSG_CHECKING([default init directory]) case "$VENDOR" in freebsd) initdir=$sysconfdir/rc.d ;; *) initdir=$sysconfdir/init.d;; esac AC_MSG_RESULT([$initdir]) AC_SUBST(initdir) AC_MSG_CHECKING([default init script type and shell]) case "$VENDOR" in toss) DEFAULT_INIT_SCRIPT=redhat ;; redhat) DEFAULT_INIT_SCRIPT=redhat ;; fedora) DEFAULT_INIT_SCRIPT=fedora ;; gentoo) DEFAULT_INIT_SCRIPT=openrc ;; alpine) DEFAULT_INIT_SCRIPT=openrc ;; arch) DEFAULT_INIT_SCRIPT=lsb ;; sles) DEFAULT_INIT_SCRIPT=lsb ;; slackware) DEFAULT_INIT_SCRIPT=lsb ;; lunar) DEFAULT_INIT_SCRIPT=lunar ;; ubuntu) DEFAULT_INIT_SCRIPT=lsb ;; debian) DEFAULT_INIT_SCRIPT=lsb ;; freebsd) DEFAULT_INIT_SCRIPT=freebsd;; *) DEFAULT_INIT_SCRIPT=lsb ;; esac # On gentoo, it's possible that OpenRC isn't installed. Check if # /sbin/openrc-run exists, and if not, fall back to generic defaults. DEFAULT_INIT_SHELL="/bin/sh" AS_IF([test "$DEFAULT_INIT_SCRIPT" = "openrc"], [ AS_IF([test -x "/sbin/openrc-run"], [DEFAULT_INIT_SHELL="/sbin/openrc-run"], [DEFAULT_INIT_SCRIPT=lsb]) ]) AC_MSG_RESULT([$DEFAULT_INIT_SCRIPT:$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SCRIPT) AC_SUBST(DEFAULT_INIT_SHELL) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], [DEFAULT_INIT_NFS_SERVER="nfs-kernel-server"], [DEFAULT_INIT_NFS_SERVER="nfs"] ) AC_MSG_RESULT([$DEFAULT_INIT_NFS_SERVER]) AC_SUBST(DEFAULT_INIT_NFS_SERVER) AC_MSG_CHECKING([default init config directory]) case "$VENDOR" in alpine) initconfdir=/etc/conf.d ;; gentoo) initconfdir=/etc/conf.d ;; toss) initconfdir=/etc/sysconfig ;; redhat) initconfdir=/etc/sysconfig ;; fedora) initconfdir=/etc/sysconfig ;; sles) initconfdir=/etc/sysconfig ;; ubuntu) initconfdir=/etc/default ;; debian) initconfdir=/etc/default ;; freebsd) initconfdir=$sysconfdir/rc.conf.d;; *) initconfdir=/etc/default ;; esac AC_MSG_RESULT([$initconfdir]) AC_SUBST(initconfdir) AC_MSG_CHECKING([whether initramfs-tools is available]) if test -d /usr/share/initramfs-tools ; then DEFINE_INITRAMFS='--define "_initramfs 1"' AC_MSG_RESULT([yes]) else DEFINE_INITRAMFS='' AC_MSG_RESULT([no]) fi AC_SUBST(DEFINE_INITRAMFS) ]) dnl # dnl # Default ZFS package configuration dnl # AC_DEFUN([ZFS_AC_PACKAGE], [ ZFS_AC_DEFAULT_PACKAGE AS_IF([test x$VENDOR != xfreebsd], [ ZFS_AC_RPM ZFS_AC_DPKG ZFS_AC_ALIEN ]) ]) Index: vendor-sys/openzfs/dist/etc/systemd/system/zfs-mount.service.in =================================================================== --- vendor-sys/openzfs/dist/etc/systemd/system/zfs-mount.service.in (revision 365339) +++ vendor-sys/openzfs/dist/etc/systemd/system/zfs-mount.service.in (revision 365340) @@ -1,18 +1,17 @@ [Unit] Description=Mount ZFS filesystems Documentation=man:zfs(8) DefaultDependencies=no After=systemd-udev-settle.service After=zfs-import.target After=systemd-remount-fs.service Before=local-fs.target -Before=systemd-random-seed.service ConditionPathIsDirectory=/sys/module/zfs [Service] Type=oneshot RemainAfterExit=yes ExecStart=@sbindir@/zfs mount -a [Install] WantedBy=zfs.target Index: vendor-sys/openzfs/dist/etc/systemd/system-generators/zfs-mount-generator.in =================================================================== --- vendor-sys/openzfs/dist/etc/systemd/system-generators/zfs-mount-generator.in (revision 365339) +++ vendor-sys/openzfs/dist/etc/systemd/system-generators/zfs-mount-generator.in (revision 365340) @@ -1,450 +1,473 @@ #!/bin/sh # zfs-mount-generator - generates systemd mount units for zfs # Copyright (c) 2017 Antonio Russo # Copyright (c) 2020 InsanePrawn # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. set -e FSLIST="@sysconfdir@/zfs/zfs-list.cache" [ -d "${FSLIST}" ] || exit 0 do_fail() { printf 'zfs-mount-generator: %s\n' "$*" > /dev/kmsg exit 1 } # test if $1 is in space-separated list $2 is_known() { query="$1" IFS=' ' - # protect against special characters - set -f for element in $2 ; do if [ "$query" = "$element" ] ; then return 0 fi done return 1 } # create dependency on unit file $1 # of type $2, i.e. "wants" or "requires" # in the target units from space-separated list $3 create_dependencies() { unitfile="$1" suffix="$2" - # protect against special characters - set -f + IFS=' ' for target in $3 ; do target_dir="${dest_norm}/${target}.${suffix}/" mkdir -p "${target_dir}" ln -s "../${unitfile}" "${target_dir}" done } # see systemd.generator if [ $# -eq 0 ] ; then dest_norm="/tmp" elif [ $# -eq 3 ] ; then dest_norm="${1}" else do_fail "zero or three arguments required" fi +pools=$(zpool list -H -o name || true) # All needed information about each ZFS is available from # zfs list -H -t filesystem -o # cached in $FSLIST, and each line is processed by the following function: # See the list below for the properties and their order process_line() { # zfs list -H -o name,... # fields are tab separated IFS="$(printf '\t')" - # protect against special characters in, e.g., mountpoints - set -f # shellcheck disable=SC2086 set -- $1 + dataset="${1}" + pool="${dataset%%/*}" p_mountpoint="${2}" p_canmount="${3}" p_atime="${4}" p_relatime="${5}" p_devices="${6}" p_exec="${7}" p_readonly="${8}" p_setuid="${9}" p_nbmand="${10}" p_encroot="${11}" p_keyloc="${12}" p_systemd_requires="${13}" p_systemd_requiresmountsfor="${14}" p_systemd_before="${15}" p_systemd_after="${16}" p_systemd_wantedby="${17}" p_systemd_requiredby="${18}" p_systemd_nofail="${19}" p_systemd_ignore="${20}" # Minimal pre-requisites to mount a ZFS dataset # By ordering before zfs-mount.service, we avoid race conditions. after="zfs-import.target" before="zfs-mount.service" wants="zfs-import.target" requires="" requiredmounts="" bindsto="" wantedby="" requiredby="" noauto="off" + # If the pool is already imported, zfs-import.target is not needed. This + # avoids a dependency loop on root-on-ZFS systems: + # systemd-random-seed.service After (via RequiresMountsFor) var-lib.mount + # After zfs-import.target After zfs-import-{cache,scan}.service After + # cryptsetup.service After systemd-random-seed.service. + # + # Pools are newline-separated and may contain spaces in their names. + # There is no better portable way to set IFS to just a newline. Using + # $(printf '\n') doesn't work because $(...) strips trailing newlines. + IFS=" +" + for p in $pools ; do + if [ "$p" = "$pool" ] ; then + after="" + wants="" + break + fi + done + if [ -n "${p_systemd_after}" ] && \ [ "${p_systemd_after}" != "-" ] ; then after="${p_systemd_after} ${after}" fi if [ -n "${p_systemd_before}" ] && \ [ "${p_systemd_before}" != "-" ] ; then before="${p_systemd_before} ${before}" fi if [ -n "${p_systemd_requires}" ] && \ [ "${p_systemd_requires}" != "-" ] ; then requires="Requires=${p_systemd_requires}" fi if [ -n "${p_systemd_requiresmountsfor}" ] && \ [ "${p_systemd_requiresmountsfor}" != "-" ] ; then requiredmounts="RequiresMountsFor=${p_systemd_requiresmountsfor}" fi # Handle encryption if [ -n "${p_encroot}" ] && [ "${p_encroot}" != "-" ] ; then keyloadunit="zfs-load-key-$(systemd-escape "${p_encroot}").service" if [ "${p_encroot}" = "${dataset}" ] ; then keymountdep="" if [ "${p_keyloc%%://*}" = "file" ] ; then if [ -n "${requiredmounts}" ] ; then keymountdep="${requiredmounts} '${p_keyloc#file://}'" else keymountdep="RequiresMountsFor='${p_keyloc#file://}'" fi keyloadscript="@sbindir@/zfs load-key \"${dataset}\"" elif [ "${p_keyloc}" = "prompt" ] ; then keyloadscript="\ count=0;\ while [ \$\$count -lt 3 ];do\ systemd-ask-password --id=\"zfs:${dataset}\"\ \"Enter passphrase for ${dataset}:\"|\ @sbindir@/zfs load-key \"${dataset}\" && exit 0;\ count=\$\$((count + 1));\ done;\ exit 1" else printf 'zfs-mount-generator: (%s) invalid keylocation\n' \ "${dataset}" >/dev/kmsg fi keyloadcmd="\ /bin/sh -c '\ set -eu;\ keystatus=\"\$\$(@sbindir@/zfs get -H -o value keystatus \"${dataset}\")\";\ [ \"\$\$keystatus\" = \"unavailable\" ] || exit 0;\ ${keyloadscript}'" keyunloadcmd="\ /bin/sh -c '\ set -eu;\ keystatus=\"\$\$(@sbindir@/zfs get -H -o value keystatus \"${dataset}\")\";\ [ \"\$\$keystatus\" = \"available\" ] || exit 0;\ @sbindir@/zfs unload-key \"${dataset}\"'" # Generate the key-load .service unit # # Note: It is tempting to use a `< "${dest_norm}/${keyloadunit}" fi # Update the dependencies for the mount file to want the # key-loading unit. wants="${wants}" bindsto="BindsTo=${keyloadunit}" after="${after} ${keyloadunit}" fi # Prepare the .mount unit # skip generation of the mount unit if org.openzfs.systemd:ignore is "on" if [ -n "${p_systemd_ignore}" ] ; then if [ "${p_systemd_ignore}" = "on" ] ; then return elif [ "${p_systemd_ignore}" = "-" ] \ || [ "${p_systemd_ignore}" = "off" ] ; then : # This is OK else do_fail "invalid org.openzfs.systemd:ignore for ${dataset}" fi fi # Check for canmount=off . if [ "${p_canmount}" = "off" ] ; then return elif [ "${p_canmount}" = "noauto" ] ; then noauto="on" elif [ "${p_canmount}" = "on" ] ; then : # This is OK else do_fail "invalid canmount for ${dataset}" fi # Check for legacy and blank mountpoints. if [ "${p_mountpoint}" = "legacy" ] ; then return elif [ "${p_mountpoint}" = "none" ] ; then return elif [ "${p_mountpoint%"${p_mountpoint#?}"}" != "/" ] ; then do_fail "invalid mountpoint for ${dataset}" fi # Escape the mountpoint per systemd policy. mountfile="$(systemd-escape --path --suffix=mount "${p_mountpoint}")" # Parse options # see lib/libzfs/libzfs_mount.c:zfs_add_options opts="" # atime if [ "${p_atime}" = on ] ; then # relatime if [ "${p_relatime}" = on ] ; then opts="${opts},atime,relatime" elif [ "${p_relatime}" = off ] ; then opts="${opts},atime,strictatime" else printf 'zfs-mount-generator: (%s) invalid relatime\n' \ "${dataset}" >/dev/kmsg fi elif [ "${p_atime}" = off ] ; then opts="${opts},noatime" else printf 'zfs-mount-generator: (%s) invalid atime\n' \ "${dataset}" >/dev/kmsg fi # devices if [ "${p_devices}" = on ] ; then opts="${opts},dev" elif [ "${p_devices}" = off ] ; then opts="${opts},nodev" else printf 'zfs-mount-generator: (%s) invalid devices\n' \ "${dataset}" >/dev/kmsg fi # exec if [ "${p_exec}" = on ] ; then opts="${opts},exec" elif [ "${p_exec}" = off ] ; then opts="${opts},noexec" else printf 'zfs-mount-generator: (%s) invalid exec\n' \ "${dataset}" >/dev/kmsg fi # readonly if [ "${p_readonly}" = on ] ; then opts="${opts},ro" elif [ "${p_readonly}" = off ] ; then opts="${opts},rw" else printf 'zfs-mount-generator: (%s) invalid readonly\n' \ "${dataset}" >/dev/kmsg fi # setuid if [ "${p_setuid}" = on ] ; then opts="${opts},suid" elif [ "${p_setuid}" = off ] ; then opts="${opts},nosuid" else printf 'zfs-mount-generator: (%s) invalid setuid\n' \ "${dataset}" >/dev/kmsg fi # nbmand if [ "${p_nbmand}" = on ] ; then opts="${opts},mand" elif [ "${p_nbmand}" = off ] ; then opts="${opts},nomand" else printf 'zfs-mount-generator: (%s) invalid nbmand\n' \ "${dataset}" >/dev/kmsg fi if [ -n "${p_systemd_wantedby}" ] && \ [ "${p_systemd_wantedby}" != "-" ] ; then noauto="on" if [ "${p_systemd_wantedby}" = "none" ] ; then wantedby="" else wantedby="${p_systemd_wantedby}" before="${before} ${wantedby}" fi fi if [ -n "${p_systemd_requiredby}" ] && \ [ "${p_systemd_requiredby}" != "-" ] ; then noauto="on" if [ "${p_systemd_requiredby}" = "none" ] ; then requiredby="" else requiredby="${p_systemd_requiredby}" before="${before} ${requiredby}" fi fi # For datasets with canmount=on, a dependency is created for # local-fs.target by default. To avoid regressions, this dependency # is reduced to "wants" rather than "requires" when nofail is not "off". # **THIS MAY CHANGE** # noauto=on disables this behavior completely. if [ "${noauto}" != "on" ] ; then if [ "${p_systemd_nofail}" = "off" ] ; then requiredby="local-fs.target" before="${before} local-fs.target" else wantedby="local-fs.target" if [ "${p_systemd_nofail}" != "on" ] ; then before="${before} local-fs.target" fi fi fi # Handle existing files: # 1. We never overwrite existing files, although we may delete # files if we're sure they were created by us. (see 5.) # 2. We handle files differently based on canmount. Units with canmount=on # always have precedence over noauto. This is enforced by the sort pipe # in the loop around this function. # It is important to use $p_canmount and not $noauto here, since we # sort by canmount while other properties also modify $noauto, e.g. # org.openzfs.systemd:wanted-by. # 3. If no unit file exists for a noauto dataset, we create one. # Additionally, we use $noauto_files to track the unit file names # (which are the systemd-escaped mountpoints) of all (exclusively) # noauto datasets that had a file created. # 4. If the file to be created is found in the tracking variable, # we do NOT create it. # 5. If a file exists for a noauto dataset, we check whether the file # name is in the variable. If it is, we have multiple noauto datasets # for the same mountpoint. In such cases, we remove the file for safety. # To avoid further noauto datasets creating a file for this path again, # we leave the file name in the tracking variable. if [ -e "${dest_norm}/${mountfile}" ] ; then if is_known "$mountfile" "$noauto_files" ; then # if it's in $noauto_files, we must be noauto too. See 2. printf 'zfs-mount-generator: removing duplicate noauto %s\n' \ "${mountfile}" >/dev/kmsg # See 5. rm "${dest_norm}/${mountfile}" else # don't log for canmount=noauto if [ "${p_canmount}" = "on" ] ; then printf 'zfs-mount-generator: %s already exists. Skipping.\n' \ "${mountfile}" >/dev/kmsg fi fi # file exists; Skip current dataset. return else if is_known "${mountfile}" "${noauto_files}" ; then # See 4. return elif [ "${p_canmount}" = "noauto" ] ; then noauto_files="${mountfile} ${noauto_files}" fi fi # Create the .mount unit file. # # (Do not use `< "${dest_norm}/${mountfile}" # Finally, create the appropriate dependencies create_dependencies "${mountfile}" "wants" "$wantedby" create_dependencies "${mountfile}" "requires" "$requiredby" } for cachefile in "${FSLIST}/"* ; do + # Disable glob expansion to protect against special characters when parsing. + set -f # Sort cachefile's lines by canmount, "on" before "noauto" # and feed each line into process_line sort -t "$(printf '\t')" -k 3 -r "${cachefile}" | \ ( # subshell is necessary for `sort|while read` and $noauto_files noauto_files="" while read -r fs ; do process_line "${fs}" done ) done Index: vendor-sys/openzfs/dist/include/libzfs.h =================================================================== --- vendor-sys/openzfs/dist/include/libzfs.h (revision 365339) +++ vendor-sys/openzfs/dist/include/libzfs.h (revision 365340) @@ -1,920 +1,932 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. */ #ifndef _LIBZFS_H #define _LIBZFS_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Miscellaneous ZFS constants */ #define ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN /* * libzfs errors */ typedef enum zfs_error { EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ EZFS_PROPTYPE, /* property does not apply to dataset type */ EZFS_PROPNONINHERIT, /* property is not inheritable */ EZFS_PROPSPACE, /* bad quota or reservation */ EZFS_BADTYPE, /* dataset is not of appropriate type */ EZFS_BUSY, /* pool or dataset is busy */ EZFS_EXISTS, /* pool or dataset already exists */ EZFS_NOENT, /* no such pool or dataset */ EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ EZFS_BADTARGET, /* bad attach/detach/replace target */ EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ EZFS_BADPATH, /* must be an absolute path */ EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ EZFS_SHARENFSFAILED, /* share(1M) failed */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ EZFS_IO, /* I/O error */ EZFS_INTR, /* signal received */ EZFS_ISSPARE, /* device is a hot spare */ EZFS_INVALCONFIG, /* invalid vdev configuration */ EZFS_RECURSIVE, /* recursive dependency */ EZFS_NOHISTORY, /* no history object */ EZFS_POOLPROPS, /* couldn't retrieve pool props */ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ EZFS_NAMETOOLONG, /* dataset name is too long */ EZFS_OPENFAILED, /* open of device failed */ EZFS_NOCAP, /* couldn't get capacity */ EZFS_LABELFAILED, /* write of label failed */ EZFS_BADWHO, /* invalid permission who */ EZFS_BADPERM, /* invalid permission */ EZFS_BADPERMSET, /* invalid permission set name */ EZFS_NODELEGATION, /* delegated administration is disabled */ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ EZFS_ISL2CACHE, /* device is for the level 2 ARC */ EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ EZFS_REFTAG_RELE, /* snapshot release: tag not found */ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ EZFS_NO_SCRUB, /* no active scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ EZFS_ACTIVE_POOL, /* pool is imported on a different system */ EZFS_CRYPTOFAILED, /* failed to setup encryption */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */ EZFS_TOOMANY, /* argument list too long */ EZFS_INITIALIZING, /* currently initializing */ EZFS_NO_INITIALIZE, /* no active initialize */ EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */ EZFS_TRIMMING, /* currently trimming */ EZFS_NO_TRIM, /* no active trim */ EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ EZFS_UNKNOWN } zfs_error_t; /* * The following data structures are all part * of the zfs_allow_t data structure which is * used for printing 'allow' permissions. * It is a linked list of zfs_allow_t's which * then contain avl tree's for user/group/sets/... * and each one of the entries in those trees have * avl tree's for the permissions they belong to and * whether they are local,descendent or local+descendent * permissions. The AVL trees are used primarily for * sorting purposes, but also so that we can quickly find * a given user and or permission. */ typedef struct zfs_perm_node { avl_node_t z_node; char z_pname[MAXPATHLEN]; } zfs_perm_node_t; typedef struct zfs_allow_node { avl_node_t z_node; char z_key[MAXPATHLEN]; /* name, such as joe */ avl_tree_t z_localdescend; /* local+descendent perms */ avl_tree_t z_local; /* local permissions */ avl_tree_t z_descend; /* descendent permissions */ } zfs_allow_node_t; typedef struct zfs_allow { struct zfs_allow *z_next; char z_setpoint[MAXPATHLEN]; avl_tree_t z_sets; avl_tree_t z_crperms; avl_tree_t z_user; avl_tree_t z_group; avl_tree_t z_everyone; } zfs_allow_t; /* * Basic handle types */ typedef struct zfs_handle zfs_handle_t; typedef struct zpool_handle zpool_handle_t; typedef struct libzfs_handle libzfs_handle_t; extern int zpool_wait(zpool_handle_t *, zpool_wait_activity_t); extern int zpool_wait_status(zpool_handle_t *, zpool_wait_activity_t, boolean_t *, boolean_t *); /* * Library initialization */ extern libzfs_handle_t *libzfs_init(void); extern void libzfs_fini(libzfs_handle_t *); extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *); extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); extern void zfs_save_arguments(int argc, char **, char *, int); extern int zpool_log_history(libzfs_handle_t *, const char *); extern int libzfs_errno(libzfs_handle_t *); extern const char *libzfs_error_init(int); extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *); extern int zfs_standard_error(libzfs_handle_t *, int, const char *); extern void libzfs_mnttab_init(libzfs_handle_t *); extern void libzfs_mnttab_fini(libzfs_handle_t *); extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, const char *, const char *); extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions */ extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); extern void zpool_close(zpool_handle_t *); extern const char *zpool_get_name(zpool_handle_t *); extern int zpool_get_state(zpool_handle_t *); extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); extern const char *zpool_pool_state_to_name(pool_state_t); extern void zpool_free_handles(libzfs_handle_t *); /* * Iterate over all active pools in the system. */ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); extern boolean_t zpool_skip_pool(const char *); /* * Functions to create and destroy pools */ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); extern int zpool_destroy(zpool_handle_t *, const char *); extern int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { /* do not split, but return the config that would be split off */ int dryrun : 1; /* after splitting, import the pool */ int import : 1; int name_flags; } splitflags_t; typedef struct trimflags { /* requested vdevs are for the entire pool */ boolean_t fullpool; /* request a secure trim, requires support from device */ boolean_t secure; /* after starting trim, block until trim completes */ boolean_t wait; /* trim at the requested rate in bytes/second */ uint64_t rate; } trimflags_t; /* * Functions to manipulate pool and vdev state */ extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); extern int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); extern int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, trimflags_t *); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_reguid(zpool_handle_t *); extern int zpool_reopen_one(zpool_handle_t *, void *); extern int zpool_sync_one(zpool_handle_t *, void *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int, boolean_t); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); extern int zpool_vdev_remove_cancel(zpool_handle_t *); extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); extern uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); const char *zpool_get_state_str(zpool_handle_t *); /* * Functions to manage pool properties */ extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t literal); extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); extern int zpool_props_refresh(zpool_handle_t *); extern const char *zpool_prop_to_name(zpool_prop_t); extern const char *zpool_prop_values(zpool_prop_t); /* * Pool health statistics. */ typedef enum { /* * The following correspond to faults as defined in the (fault.fs.zfs.*) * event namespace. Each is associated with a corresponding message ID. * This must be kept in sync with the zfs_msgid_table in * lib/libzfs/libzfs_status.c. */ ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_HOSTID_ACTIVE, /* currently active on another system */ ZPOOL_STATUS_HOSTID_REQUIRED, /* multihost=on and hostid=0 */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ZPOOL_STATUS_IO_FAILURE_MMP, /* failed MMP, failmode not 'panic' */ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ ZPOOL_STATUS_ERRATA, /* informational errata available */ /* * If the pool has unsupported features but can still be opened in * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the * pool has unsupported features but cannot be opened at all, its * status is ZPOOL_STATUS_UNSUP_FEAT_READ. */ ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* * The following are not faults per se, but still an error possibly * requiring administrative attention. There is no corresponding * message ID. */ ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ /* * Finally, the following indicates a healthy pool. */ ZPOOL_STATUS_OK } zpool_status_t; extern zpool_status_t zpool_get_status(zpool_handle_t *, char **, zpool_errata_t *); extern zpool_status_t zpool_import_status(nvlist_t *, char **, zpool_errata_t *); /* * Statistics and configuration functions. */ extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); extern nvlist_t *zpool_get_features(zpool_handle_t *); extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); /* * Import and export functions */ extern int zpool_export(zpool_handle_t *, boolean_t, const char *); extern int zpool_export_force(zpool_handle_t *, const char *); extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); extern void zpool_print_unsup_feat(nvlist_t *config); /* * Miscellaneous pool functions */ struct zfs_cmd; extern const char *zfs_history_event_names[]; typedef enum { VDEV_NAME_PATH = 1 << 0, VDEV_NAME_GUID = 1 << 1, VDEV_NAME_FOLLOW_LINKS = 1 << 2, VDEV_NAME_TYPE_ID = 1 << 3, } vdev_name_t; extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, int name_flags); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *, boolean_t *); extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, int); extern int zpool_events_clear(libzfs_handle_t *, int *); extern int zpool_events_seek(libzfs_handle_t *, uint64_t, int); extern void zpool_obj_to_path_ds(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); extern int zpool_checkpoint(zpool_handle_t *); extern int zpool_discard_checkpoint(zpool_handle_t *); /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. */ extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); extern void zfs_close(zfs_handle_t *); extern zfs_type_t zfs_get_type(const zfs_handle_t *); extern const char *zfs_get_name(const zfs_handle_t *); extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); extern const char *zfs_get_pool_name(const zfs_handle_t *); /* * Property management functions. Some functions are shared with the kernel, * and are found in sys/fs/zfs.h. */ /* * zfs dataset property management */ extern const char *zfs_prop_default_string(zfs_prop_t); extern uint64_t zfs_prop_default_numeric(zfs_prop_t); extern const char *zfs_prop_column_name(zfs_prop_t); extern boolean_t zfs_prop_align_right(zfs_prop_t); extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, boolean_t, const char *); extern const char *zfs_prop_to_name(zfs_prop_t); extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, boolean_t); extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, char *buf, size_t len); extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); extern int zfs_prop_is_string(zfs_prop_t prop); extern nvlist_t *zfs_get_all_props(zfs_handle_t *); extern nvlist_t *zfs_get_user_props(zfs_handle_t *); extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); extern int zfs_wait_status(zfs_handle_t *, zfs_wait_activity_t, boolean_t *, boolean_t *); /* * zfs encryption management */ extern int zfs_crypto_get_encryption_root(zfs_handle_t *, boolean_t *, char *); extern int zfs_crypto_create(libzfs_handle_t *, char *, nvlist_t *, nvlist_t *, boolean_t stdin_available, uint8_t **, uint_t *); extern int zfs_crypto_clone_check(libzfs_handle_t *, zfs_handle_t *, char *, nvlist_t *); extern int zfs_crypto_attempt_load_keys(libzfs_handle_t *, char *); extern int zfs_crypto_load_key(zfs_handle_t *, boolean_t, char *); extern int zfs_crypto_unload_key(zfs_handle_t *); extern int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, boolean_t); typedef struct zprop_list { int pl_prop; char *pl_user_prop; struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, boolean_t); extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" #define ZFS_FEATURE_DISABLED "disabled" #define ZFS_FEATURE_ENABLED "enabled" #define ZFS_FEATURE_ACTIVE "active" #define ZFS_UNSUPPORTED_INACTIVE "inactive" #define ZFS_UNSUPPORTED_READONLY "readonly" /* * zpool property management */ extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); extern const char *zpool_prop_default_string(zpool_prop_t); extern uint64_t zpool_prop_default_numeric(zpool_prop_t); extern const char *zpool_prop_column_name(zpool_prop_t); extern boolean_t zpool_prop_align_right(zpool_prop_t); /* * Functions shared by zfs and zpool property management. */ extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type); extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); extern void zprop_free_list(zprop_list_t *); #define ZFS_GET_NCOLS 5 typedef enum { GET_COL_NONE, GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE } zfs_get_column_t; /* * Functions for printing zfs or zpool properties */ typedef struct zprop_get_cbdata { int cb_sources; zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; zprop_list_t *cb_proplist; zfs_type_t cb_type; } zprop_get_cbdata_t; void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); /* * Iterator functions. */ typedef int (*zfs_iter_f)(zfs_handle_t *, void *); extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, uint64_t, uint64_t); extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, uint64_t, uint64_t); extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; size_t cb_alloc; size_t cb_used; } get_all_cb_t; void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, zfs_iter_f, void *, boolean_t); void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* * Functions to create and destroy datasets. */ extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); extern int zfs_create_ancestors(libzfs_handle_t *, const char *); extern int zfs_destroy(zfs_handle_t *, boolean_t); extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); -extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); + +typedef struct renameflags { + /* recursive rename */ + int recursive : 1; + + /* don't unmount file systems */ + int nounmount : 1; + + /* force unmount file systems */ + int forceunmount : 1; +} renameflags_t; + +extern int zfs_rename(zfs_handle_t *, const char *, renameflags_t); typedef struct sendflags { /* Amount of extra information to print. */ int verbosity; /* recursive send (ie, -R) */ boolean_t replicate; /* for incrementals, do all intermediate snapshots */ boolean_t doall; /* if dataset is a clone, do incremental from its origin */ boolean_t fromorigin; /* field no longer used, maintained for backwards compatibility */ boolean_t pad; /* send properties (ie, -p) */ boolean_t props; /* do not send (no-op, ie. -n) */ boolean_t dryrun; /* parsable verbose output (ie. -P) */ boolean_t parsable; /* show progress (ie. -v) */ boolean_t progress; /* large blocks (>128K) are permitted */ boolean_t largeblock; /* WRITE_EMBEDDED records of type DATA are permitted */ boolean_t embed_data; /* compressed WRITE records are permitted */ boolean_t compress; /* raw encrypted records are permitted */ boolean_t raw; /* only send received properties (ie. -b) */ boolean_t backup; /* include snapshot holds in send stream */ boolean_t holds; /* stream represents a partially received dataset */ boolean_t saved; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t *, const char *); extern int zfs_send_progress(zfs_handle_t *, int, uint64_t *, uint64_t *); extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, const char *); extern int zfs_send_saved(zfs_handle_t *, sendflags_t *, int, const char *); extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, int); extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). */ boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ boolean_t force; /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; /* * Mark the file systems as "resumable" and do not destroy them if the * receive is interrupted */ boolean_t resumable; /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ boolean_t nomount; /* Was holds flag set in the compound header? */ boolean_t holds; /* skip receive of snapshot holds */ boolean_t skipholds; /* mount the filesystem unless nomount is specified */ boolean_t domount; /* force unmount while recv snapshot (private) */ boolean_t forceunmount; } recvflags_t; extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { ZFS_DIFF_PARSEABLE = 0x1, ZFS_DIFF_TIMESTAMP = 0x2, ZFS_DIFF_CLASSIFY = 0x4 } diff_flags_t; extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, int); /* * Miscellaneous functions. */ extern const char *zfs_type_to_name(zfs_type_t); extern void zfs_refresh_properties(zfs_handle_t *); extern int zfs_name_valid(const char *, zfs_type_t); extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_parent_name(zfs_handle_t *, char *, size_t); extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_spa_version(zfs_handle_t *, int *); extern boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. */ extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); extern int zfs_mount(zfs_handle_t *, const char *, int); extern int zfs_mount_at(zfs_handle_t *, const char *, int, const char *); extern int zfs_unmount(zfs_handle_t *, const char *, int); extern int zfs_unmountall(zfs_handle_t *, int); #if defined(__linux__) extern int zfs_parse_mount_options(char *mntopts, unsigned long *mntflags, unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt); extern void zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, char *mntopts, char *mtabopt); #endif /* * Share support functions. */ extern boolean_t zfs_is_shared(zfs_handle_t *); extern int zfs_share(zfs_handle_t *); extern int zfs_unshare(zfs_handle_t *); /* * Protocol-specific share support functions. */ extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); extern int zfs_share_nfs(zfs_handle_t *); extern int zfs_share_smb(zfs_handle_t *); extern int zfs_shareall(zfs_handle_t *); extern int zfs_unshare_nfs(zfs_handle_t *, const char *); extern int zfs_unshare_smb(zfs_handle_t *, const char *); extern int zfs_unshareall_nfs(zfs_handle_t *); extern int zfs_unshareall_smb(zfs_handle_t *); extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); extern int zfs_unshareall_bytype(zfs_handle_t *, const char *, const char *); extern int zfs_unshareall(zfs_handle_t *); extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); extern void zfs_commit_nfs_shares(void); extern void zfs_commit_smb_shares(void); extern void zfs_commit_all_shares(void); extern void zfs_commit_shares(const char *); extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* * Utility functions to run an external process. */ #define STDOUT_VERBOSE 0x01 #define STDERR_VERBOSE 0x02 #define NO_DEFAULT_PATH 0x04 /* Don't use $PATH to lookup the command */ int libzfs_run_process(const char *, char **, int); int libzfs_run_process_get_stdout(const char *, char *[], char *[], char **[], int *); int libzfs_run_process_get_stdout_nopath(const char *, char *[], char *[], char **[], int *); void libzfs_free_str_array(char **, int); int libzfs_envvar_is_set(char *); /* * Utility functions for zfs version */ extern void zfs_version_userland(char *, int); extern int zfs_version_kernel(char *, int); extern int zfs_version_print(void); /* * Given a device or file, determine if it is part of a pool. */ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* * Label manipulation. */ extern int zpool_clear_label(int); extern int zpool_set_bootenv(zpool_handle_t *, const char *); extern int zpool_get_bootenv(zpool_handle_t *, char *, size_t, off_t); /* * Management interfaces for SMB ACL files */ int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); #ifdef __FreeBSD__ /* * Attach/detach the given filesystem to/from the given jail. */ extern int zfs_jail(zfs_handle_t *zhp, int jailid, int attach); /* * Set loader options for next boot. */ extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); #endif /* __FreeBSD__ */ #ifdef __cplusplus } #endif #endif /* _LIBZFS_H */ Index: vendor-sys/openzfs/dist/include/libzfs_impl.h =================================================================== --- vendor-sys/openzfs/dist/include/libzfs_impl.h (revision 365339) +++ vendor-sys/openzfs/dist/include/libzfs_impl.h (revision 365340) @@ -1,262 +1,266 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018 Datto Inc. * Copyright 2020 Joyent, Inc. */ #ifndef _LIBZFS_IMPL_H #define _LIBZFS_IMPL_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct libzfs_handle { int libzfs_error; int libzfs_fd; FILE *libzfs_mnttab; zpool_handle_t *libzfs_pool_handles; uu_avl_pool_t *libzfs_ns_avlpool; uu_avl_t *libzfs_ns_avl; uint64_t libzfs_ns_gen; int libzfs_desc_active; char libzfs_action[1024]; char libzfs_desc[1024]; int libzfs_printerr; int libzfs_storeerr; /* stuff error messages into buffer */ boolean_t libzfs_mnttab_enable; /* * We need a lock to handle the case where parallel mount * threads are populating the mnttab cache simultaneously. The * lock only protects the integrity of the avl tree, and does * not protect the contents of the mnttab entries themselves. */ pthread_mutex_t libzfs_mnttab_cache_lock; avl_tree_t libzfs_mnttab_cache; int libzfs_pool_iter; char libzfs_chassis_id[256]; boolean_t libzfs_prop_debug; regex_t libzfs_urire; uint64_t libzfs_max_nvlist; }; struct zfs_handle { libzfs_handle_t *zfs_hdl; zpool_handle_t *zpool_hdl; char zfs_name[ZFS_MAX_DATASET_NAME_LEN]; zfs_type_t zfs_type; /* type including snapshot */ zfs_type_t zfs_head_type; /* type excluding snapshot */ dmu_objset_stats_t zfs_dmustats; nvlist_t *zfs_props; nvlist_t *zfs_user_props; nvlist_t *zfs_recvd_props; boolean_t zfs_mntcheck; char *zfs_mntopts; uint8_t *zfs_props_table; }; /* * This is different from checking zfs_type, because it will also catch * snapshots of volumes. */ #define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) struct zpool_handle { libzfs_handle_t *zpool_hdl; zpool_handle_t *zpool_next; char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; int zpool_state; size_t zpool_config_size; nvlist_t *zpool_config; nvlist_t *zpool_old_config; nvlist_t *zpool_props; diskaddr_t zpool_start_block; }; typedef enum { PROTO_NFS = 0, PROTO_SMB = 1, PROTO_END = 2 } zfs_share_proto_t; /* * The following can be used as a bitmask and any new values * added must preserve that capability. */ typedef enum { SHARED_NOT_SHARED = 0x0, SHARED_NFS = 0x2, SHARED_SMB = 0x4 } zfs_share_type_t; typedef int (*zfs_uri_handler_fn_t)(struct libzfs_handle *, const char *, const char *, zfs_keyformat_t, boolean_t, uint8_t **, size_t *); typedef struct zfs_uri_handler { const char *zuh_scheme; zfs_uri_handler_fn_t zuh_handler; } zfs_uri_handler_t; #define CONFIG_BUF_MINSIZE 262144 int zfs_error(libzfs_handle_t *, int, const char *); int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...); void zfs_error_aux(libzfs_handle_t *, const char *, ...); void *zfs_alloc(libzfs_handle_t *, size_t); void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); char *zfs_asprintf(libzfs_handle_t *, const char *, ...); char *zfs_strdup(libzfs_handle_t *, const char *); int no_memory(libzfs_handle_t *); int zfs_standard_error(libzfs_handle_t *, int, const char *); int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); int zpool_standard_error(libzfs_handle_t *, int, const char *); int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, nvlist_t *, char **, uint64_t *, const char *); int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type); /* * Use this changelist_gather() flag to force attempting mounts * on each change node regardless of whether or not it is currently * mounted. */ #define CL_GATHER_MOUNT_ALWAYS 1 /* * changelist_gather() flag to force it to iterate on mounted datasets only */ #define CL_GATHER_ITER_MOUNTED 2 +/* + * Use this changelist_gather() flag to prevent unmounting of file systems. + */ +#define CL_GATHER_DONT_UNMOUNT 4 typedef struct prop_changelist prop_changelist_t; int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); void zcmd_free_nvlists(zfs_cmd_t *); int changelist_prefix(prop_changelist_t *); int changelist_postfix(prop_changelist_t *); void changelist_rename(prop_changelist_t *, const char *, const char *); void changelist_remove(prop_changelist_t *, const char *); void changelist_free(prop_changelist_t *); prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int); int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); int changelist_haszonedchild(prop_changelist_t *); void remove_mountpoint(zfs_handle_t *); int create_parents(libzfs_handle_t *, char *, int); boolean_t isa_child_of(const char *dataset, const char *parent); zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *, nvlist_t *props); int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying); void namespace_clear(libzfs_handle_t *); extern int zfs_parse_options(char *, zfs_share_proto_t); extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *); typedef struct { zfs_prop_t p_prop; char *p_name; int p_share_err; int p_unshare_err; } proto_table_t; typedef struct differ_info { zfs_handle_t *zhp; char *fromsnap; char *frommnt; char *tosnap; char *tomnt; char *ds; char *dsmnt; char *tmpsnap; char errbuf[1024]; boolean_t isclone; boolean_t scripted; boolean_t classify; boolean_t timestamped; uint64_t shares; int zerr; int cleanupfd; int outputfd; int datafd; } differ_info_t; extern proto_table_t proto_table[PROTO_END]; extern int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags); extern int do_unmount(const char *mntpt, int flags); extern int zfs_mount_delegation_check(void); extern int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto); extern int unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, zfs_share_proto_t proto); extern boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, zprop_source_t *source, int flags); extern zfs_share_type_t is_shared(const char *mountpoint, zfs_share_proto_t proto); extern int libzfs_load_module(void); extern int zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg); extern int find_shares_object(differ_info_t *di); extern void libzfs_set_pipe_max(int infd); extern void zfs_commit_proto(zfs_share_proto_t *); #ifdef __cplusplus } #endif #endif /* _LIBZFS_IMPL_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/cred.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/cred.h (revision 365339) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/cred.h (revision 365340) @@ -1,188 +1,188 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * Portions of this source code were derived from Berkeley 4.3 BSD * under license from the Regents of the University of California. */ #ifndef _SYS_CRED_H #define _SYS_CRED_H #include #ifdef __cplusplus extern "C" { #endif /* * The credential is an opaque kernel private data structure defined in * . */ typedef struct ucred cred_t; #define CRED() curthread->td_ucred #define kcred (thread0.td_ucred) #define KUID_TO_SUID(x) (x) #define KGID_TO_SGID(x) (x) #define crgetuid(cred) ((cred)->cr_uid) #define crgetruid(cred) ((cred)->cr_ruid) #define crgetgid(cred) ((cred)->cr_gid) #define crgetgroups(cred) ((cred)->cr_groups) #define crgetngroups(cred) ((cred)->cr_ngroups) #define crgetsid(cred, i) (NULL) struct proc; /* cred.h is included in proc.h */ struct prcred; struct ksid; struct ksidlist; struct credklpd; struct credgrp; struct auditinfo_addr; /* cred.h is included in audit.h */ extern int ngroups_max; /* * kcred is used when you need all privileges. */ extern void cred_init(void); extern void crfree(cred_t *); extern cred_t *cralloc(void); /* all but ref uninitialized */ extern cred_t *cralloc_ksid(void); /* cralloc() + ksid alloc'ed */ extern cred_t *crget(void); /* initialized */ extern void crcopy_to(cred_t *, cred_t *); extern cred_t *crdup(cred_t *); extern void crdup_to(cred_t *, cred_t *); extern cred_t *crgetcred(void); extern void crset(struct proc *, cred_t *); extern void crset_zone_privall(cred_t *); extern int supgroupmember(gid_t, const cred_t *); extern int hasprocperm(const cred_t *, const cred_t *); extern int prochasprocperm(struct proc *, struct proc *, const cred_t *); extern int crcmp(const cred_t *, const cred_t *); extern cred_t *zone_kcred(void); extern gid_t crgetrgid(const cred_t *); extern gid_t crgetsgid(const cred_t *); -#define crgetzoneid(x) (0) +#define crgetzoneid(cr) ((cr)->cr_prison->pr_id) extern projid_t crgetprojid(const cred_t *); extern cred_t *crgetmapped(const cred_t *); extern const struct auditinfo_addr *crgetauinfo(const cred_t *); extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); extern uint_t crgetref(const cred_t *); extern const gid_t *crgetggroups(const struct credgrp *); /* * Sets real, effective and/or saved uid/gid; * -1 argument accepted as "no change". */ extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t); extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); /* * Sets real, effective and saved uids/gids all to the same * values. Both values must be non-negative and <= MAXUID */ extern int crsetugid(cred_t *, uid_t, gid_t); /* * Functions to handle the supplemental group list. */ extern struct credgrp *crgrpcopyin(int, gid_t *); extern void crgrprele(struct credgrp *); extern void crsetcredgrp(cred_t *, struct credgrp *); /* * Private interface for setting zone association of credential. */ struct zone; extern void crsetzone(cred_t *, struct zone *); extern struct zone *crgetzone(const cred_t *); /* * Private interface for setting project id in credential. */ extern void crsetprojid(cred_t *, projid_t); /* * Private interface for nfs. */ extern cred_t *crnetadjust(cred_t *); /* * Private interface for procfs. */ extern void cred2prcred(const cred_t *, struct prcred *); /* * Private interfaces for Rampart Trusted Solaris. */ struct ts_label_s; extern struct ts_label_s *crgetlabel(const cred_t *); extern boolean_t crisremote(const cred_t *); /* * Private interfaces for ephemeral uids. */ #define VALID_UID(id, zn) \ ((id) <= MAXUID || valid_ephemeral_uid((zn), (id))) #define VALID_GID(id, zn) \ ((id) <= MAXUID || valid_ephemeral_gid((zn), (id))) extern boolean_t valid_ephemeral_uid(struct zone *, uid_t); extern boolean_t valid_ephemeral_gid(struct zone *, gid_t); extern int eph_uid_alloc(struct zone *, int, uid_t *, int); extern int eph_gid_alloc(struct zone *, int, gid_t *, int); extern void crsetsid(cred_t *, struct ksid *, int); extern void crsetsidlist(cred_t *, struct ksidlist *); extern struct ksidlist *crgetsidlist(const cred_t *); extern int crsetpriv(cred_t *, ...); extern struct credklpd *crgetcrklpd(const cred_t *); extern void crsetcrklpd(cred_t *, struct credklpd *); #ifdef __cplusplus } #endif #endif /* _SYS_CRED_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/zone.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/zone.h (revision 365339) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/zone.h (revision 365340) @@ -1,68 +1,67 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_ZONE_H_ #define _OPENSOLARIS_SYS_ZONE_H_ +#include + /* * Macros to help with zone visibility restrictions. */ #define GLOBAL_ZONEID 0 /* - * Is thread in the global zone? + * Is proc in the global zone? */ -#define INGLOBALZONE(p) in_globalzone((p)) - - -extern boolean_t in_globalzone(struct proc *); +#define INGLOBALZONE(proc) (!jailed((proc)->p_ucred)) /* * Attach the given dataset to the given jail. */ extern int zone_dataset_attach(struct ucred *, const char *, int); /* * Detach the given dataset to the given jail. */ extern int zone_dataset_detach(struct ucred *, const char *, int); /* * Returns true if the named pool/dataset is visible in the current zone. */ extern int zone_dataset_visible(const char *, int *); /* * Safely get the hostid of the specified zone (defaults to machine's hostid * if the specified zone doesn't emulate a hostid). Passing NULL retrieves * the global zone's (i.e., physical system's) hostid. */ extern uint32_t zone_get_hostid(void *); #endif /* !_OPENSOLARIS_SYS_ZONE_H_ */ Index: vendor-sys/openzfs/dist/include/os/freebsd/zfs/sys/Makefile.am =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/zfs/sys/Makefile.am (revision 365339) +++ vendor-sys/openzfs/dist/include/os/freebsd/zfs/sys/Makefile.am (revision 365340) @@ -1,14 +1,14 @@ KERNEL_H = \ freebsd_crypto.h \ sha2.h \ vdev_os.h \ zfs_context_os.h \ zfs_ctldir.h \ zfs_dir.h \ zfs_ioctl_compat.h \ - zfs_vfsops.h \ + zfs_vfsops_os.h \ zfs_vnops.h \ zfs_znode_impl.h \ zpl.h noinst_HEADERS = $(KERNEL_H) Index: vendor-sys/openzfs/dist/include/os/freebsd/zfs/sys/zfs_vfsops.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/zfs/sys/zfs_vfsops.h (revision 365339) +++ vendor-sys/openzfs/dist/include/os/freebsd/zfs/sys/zfs_vfsops.h (revision 365340) @@ -1,177 +1,176 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct zfsvfs zfsvfs_t; struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ zfsvfs_t *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ boolean_t z_unmounted; /* unmounted */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ uint64_t z_nr_znodes; /* number of znodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ boolean_t z_use_namecache; /* make use of FreeBSD name cache */ uint8_t z_xattr; /* xattr type in use */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ struct task z_unlinked_drain_task; }; #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes[**] currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. * * [*] 20 bytes on FreeBSD to fit into the size of struct fid. * [**] 2 bytes on FreeBSD for the above reason. */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; extern void zfs_init(void); extern void zfs_fini(void); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); extern int zfs_busy(void); -extern void zfsvfs_update_fromname(const char *oldname, const char *newname); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ Index: vendor-sys/openzfs/dist/include/os/linux/kernel/linux/page_compat.h =================================================================== --- vendor-sys/openzfs/dist/include/os/linux/kernel/linux/page_compat.h (revision 365339) +++ vendor-sys/openzfs/dist/include/os/linux/kernel/linux/page_compat.h (revision 365340) @@ -1,92 +1,67 @@ #ifndef _ZFS_PAGE_COMPAT_H #define _ZFS_PAGE_COMPAT_H /* * We have various enum members moving between two separate enum types, * and accessed by different functions at various times. Centralise the * insanity. * * < v4.8: all enums in zone_stat_item, via global_page_state() * v4.8: some enums moved to node_stat_item, global_node_page_state() introduced * v4.13: some enums moved from zone_stat_item to node_state_item * v4.14: global_page_state() rename to global_zone_page_state() * * The defines used here are created by config/kernel-global_page_state.m4 */ /* * Create our own accessor functions to follow the Linux API changes */ #if defined(ZFS_GLOBAL_ZONE_PAGE_STATE) /* global_zone_page_state() introduced */ #if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES) #define nr_file_pages() global_node_page_state(NR_FILE_PAGES) #else #define nr_file_pages() global_zone_page_state(NR_FILE_PAGES) #endif #if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON) #define nr_inactive_anon_pages() global_node_page_state(NR_INACTIVE_ANON) #else #define nr_inactive_anon_pages() global_zone_page_state(NR_INACTIVE_ANON) #endif #if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE) #define nr_inactive_file_pages() global_node_page_state(NR_INACTIVE_FILE) #else #define nr_inactive_file_pages() global_zone_page_state(NR_INACTIVE_FILE) #endif -#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B) -#define nr_slab_reclaimable_pages() \ - global_node_page_state(NR_SLAB_RECLAIMABLE_B) -#else -#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE) -#define nr_slab_reclaimable_pages() global_node_page_state(NR_SLAB_RECLAIMABLE) -#else -#define nr_slab_reclaimable_pages() global_zone_page_state(NR_SLAB_RECLAIMABLE) -#endif -#endif #elif defined(ZFS_GLOBAL_NODE_PAGE_STATE) /* global_node_page_state() introduced */ #if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES) #define nr_file_pages() global_node_page_state(NR_FILE_PAGES) #else #define nr_file_pages() global_page_state(NR_FILE_PAGES) #endif #if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON) #define nr_inactive_anon_pages() global_node_page_state(NR_INACTIVE_ANON) #else #define nr_inactive_anon_pages() global_page_state(NR_INACTIVE_ANON) #endif #if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE) #define nr_inactive_file_pages() global_node_page_state(NR_INACTIVE_FILE) #else #define nr_inactive_file_pages() global_page_state(NR_INACTIVE_FILE) #endif -#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B) -#define nr_slab_reclaimable_pages() \ - global_node_page_state(NR_SLAB_RECLAIMABLE_B) -#else -#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE) -#define nr_slab_reclaimable_pages() global_node_page_state(NR_SLAB_RECLAIMABLE) -#else -#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE) -#endif -#endif #else /* global_page_state() only */ #define nr_file_pages() global_page_state(NR_FILE_PAGES) #define nr_inactive_anon_pages() global_page_state(NR_INACTIVE_ANON) #define nr_inactive_file_pages() global_page_state(NR_INACTIVE_FILE) -#ifdef ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B -#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE_B) -#else -#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE) -#endif /* ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B */ #endif /* ZFS_GLOBAL_ZONE_PAGE_STATE */ #endif /* _ZFS_PAGE_COMPAT_H */ Index: vendor-sys/openzfs/dist/include/os/linux/spl/sys/vmsystm.h =================================================================== --- vendor-sys/openzfs/dist/include/os/linux/spl/sys/vmsystm.h (revision 365339) +++ vendor-sys/openzfs/dist/include/os/linux/spl/sys/vmsystm.h (revision 365340) @@ -1,103 +1,92 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_VMSYSTM_H #define _SPL_VMSYSTM_H #include #include #include #include #include #include #include #ifdef HAVE_TOTALRAM_PAGES_FUNC #define zfs_totalram_pages totalram_pages() #else #define zfs_totalram_pages totalram_pages #endif #ifdef HAVE_TOTALHIGH_PAGES #define zfs_totalhigh_pages totalhigh_pages() #else #define zfs_totalhigh_pages totalhigh_pages #endif #define membar_producer() smp_wmb() #define physmem zfs_totalram_pages -#ifdef ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B -#define freemem (nr_free_pages() + \ - global_page_state(NR_INACTIVE_FILE) + \ - global_page_state(NR_INACTIVE_ANON) + \ - global_page_state(NR_SLAB_RECLAIMABLE_B)) -#else -#define freemem (nr_free_pages() + \ - global_page_state(NR_INACTIVE_FILE) + \ - global_page_state(NR_INACTIVE_ANON) + \ - global_page_state(NR_SLAB_RECLAIMABLE)) -#endif /* ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B */ #define xcopyin(from, to, size) copy_from_user(to, from, size) #define xcopyout(from, to, size) copy_to_user(to, from, size) static __inline__ int copyin(const void *from, void *to, size_t len) { /* On error copyin routine returns -1 */ if (xcopyin(from, to, len)) return (-1); return (0); } static __inline__ int copyout(const void *from, void *to, size_t len) { /* On error copyout routine returns -1 */ if (xcopyout(from, to, len)) return (-1); return (0); } static __inline__ int copyinstr(const void *from, void *to, size_t len, size_t *done) { size_t rc; if (len == 0) return (-ENAMETOOLONG); /* XXX: Should return ENAMETOOLONG if 'strlen(from) > len' */ memset(to, 0, len); rc = copyin(from, to, len - 1); if (done != NULL) *done = rc; return (0); } #endif /* SPL_VMSYSTM_H */ Index: vendor-sys/openzfs/dist/include/os/linux/zfs/sys/Makefile.am =================================================================== --- vendor-sys/openzfs/dist/include/os/linux/zfs/sys/Makefile.am (revision 365339) +++ vendor-sys/openzfs/dist/include/os/linux/zfs/sys/Makefile.am (revision 365340) @@ -1,30 +1,30 @@ KERNEL_H = \ policy.h \ sha2.h \ trace_acl.h \ trace_arc.h \ trace_common.h \ trace_zfs.h \ trace_dbgmsg.h \ trace_dbuf.h \ trace_dmu.h \ trace_dnode.h \ trace_multilist.h \ trace_rrwlock.h \ trace_txg.h \ trace_vdev.h \ trace_zil.h \ trace_zio.h \ trace_zrlock.h \ zfs_context_os.h \ zfs_ctldir.h \ zfs_dir.h \ - zfs_vfsops.h \ + zfs_vfsops_os.h \ zfs_vnops.h \ zfs_znode_impl.h \ zpl.h if CONFIG_KERNEL kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys kernel_HEADERS = $(KERNEL_H) endif Index: vendor-sys/openzfs/dist/include/sys/Makefile.am =================================================================== --- vendor-sys/openzfs/dist/include/sys/Makefile.am (revision 365339) +++ vendor-sys/openzfs/dist/include/sys/Makefile.am (revision 365340) @@ -1,147 +1,148 @@ SUBDIRS = fm fs crypto lua sysevent zstd COMMON_H = \ abd.h \ abd_impl.h \ aggsum.h \ arc.h \ arc_impl.h \ avl.h \ avl_impl.h \ bitops.h \ blkptr.h \ bplist.h \ bpobj.h \ bptree.h \ btree.h \ bqueue.h \ dataset_kstats.h \ dbuf.h \ ddt.h \ dmu.h \ dmu_impl.h \ dmu_objset.h \ dmu_recv.h \ dmu_redact.h \ dmu_send.h \ dmu_traverse.h \ dmu_tx.h \ dmu_zfetch.h \ dnode.h \ dsl_bookmark.h \ dsl_dataset.h \ dsl_deadlist.h \ dsl_deleg.h \ dsl_destroy.h \ dsl_dir.h \ dsl_crypt.h \ dsl_pool.h \ dsl_prop.h \ dsl_scan.h \ dsl_synctask.h \ dsl_userhold.h \ edonr.h \ efi_partition.h \ frame.h \ hkdf.h \ metaslab.h \ metaslab_impl.h \ mmp.h \ mntent.h \ mod.h \ multilist.h \ note.h \ nvpair.h \ nvpair_impl.h \ objlist.h \ pathname.h \ qat.h \ range_tree.h \ rrwlock.h \ sa.h \ sa_impl.h \ skein.h \ spa_boot.h \ spa_checkpoint.h \ spa_log_spacemap.h \ space_map.h \ space_reftree.h \ spa.h \ spa_impl.h \ spa_checksum.h \ sysevent.h \ txg.h \ txg_impl.h \ u8_textprep_data.h \ u8_textprep.h \ uberblock.h \ uberblock_impl.h \ uio_impl.h \ unique.h \ uuid.h \ vdev_disk.h \ vdev_file.h \ vdev.h \ vdev_impl.h \ vdev_indirect_births.h \ vdev_indirect_mapping.h \ vdev_initialize.h \ vdev_raidz.h \ vdev_raidz_impl.h \ vdev_rebuild.h \ vdev_removal.h \ vdev_trim.h \ xvattr.h \ zap.h \ zap_impl.h \ zap_leaf.h \ zcp.h \ zcp_global.h \ zcp_iter.h \ zcp_prop.h \ zcp_set.h \ zfeature.h \ zfs_acl.h \ zfs_context.h \ zfs_debug.h \ zfs_delay.h \ zfs_file.h \ zfs_fuid.h \ zfs_project.h \ zfs_quota.h \ zfs_ratelimit.h \ zfs_refcount.h \ zfs_rlock.h \ zfs_sa.h \ zfs_stat.h \ zfs_sysfs.h \ + zfs_vfsops.h \ zfs_znode.h \ zil.h \ zil_impl.h \ zio_checksum.h \ zio_compress.h \ zio_crypt.h \ zio.h \ zio_impl.h \ zio_priority.h \ zrlock.h \ zthr.h KERNEL_H = \ zfs_ioctl.h \ zfs_ioctl_impl.h \ zfs_onexit.h \ zvol.h \ zvol_impl.h if CONFIG_USER libzfsdir = $(includedir)/libzfs/sys libzfs_HEADERS = $(COMMON_H) endif if CONFIG_KERNEL if BUILD_LINUX kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys kernel_HEADERS = $(COMMON_H) $(KERNEL_H) endif endif Index: vendor-sys/openzfs/dist/include/sys/zfs_vfsops.h =================================================================== --- vendor-sys/openzfs/dist/include/sys/zfs_vfsops.h (nonexistent) +++ vendor-sys/openzfs/dist/include/sys/zfs_vfsops.h (revision 365340) @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Portions Copyright 2020 iXsystems, Inc. + */ + +#ifndef _SYS_ZFS_VFSOPS_H +#define _SYS_ZFS_VFSOPS_H + +#ifdef _KERNEL +#include +#endif + +extern void zfsvfs_update_fromname(const char *, const char *); + +#endif /* _SYS_ZFS_VFSOPS_H */ Property changes on: vendor-sys/openzfs/dist/include/sys/zfs_vfsops.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: vendor-sys/openzfs/dist/lib/libspl/Makefile.am =================================================================== --- vendor-sys/openzfs/dist/lib/libspl/Makefile.am (revision 365339) +++ vendor-sys/openzfs/dist/lib/libspl/Makefile.am (revision 365340) @@ -1,56 +1,57 @@ include $(top_srcdir)/config/Rules.am if TARGET_CPU_I386 TARGET_CPU_ATOMIC_SOURCE = asm-i386/atomic.S else if TARGET_CPU_X86_64 TARGET_CPU_ATOMIC_SOURCE = asm-x86_64/atomic.S else TARGET_CPU_ATOMIC_SOURCE = asm-generic/atomic.c endif endif SUBDIRS = include AM_CCASFLAGS = \ $(CFLAGS) noinst_LTLIBRARIES = libspl_assert.la libspl.la libspl_assert_la_SOURCES = \ assert.c USER_C = \ list.c \ mkdirp.c \ page.c \ strlcat.c \ strlcpy.c \ timestamp.c \ - zone.c \ include/sys/list.h \ include/sys/list_impl.h if BUILD_LINUX USER_C += \ os/linux/getexecname.c \ os/linux/gethostid.c \ - os/linux/getmntany.c + os/linux/getmntany.c \ + os/linux/zone.c endif if BUILD_FREEBSD USER_C += \ os/freebsd/getexecname.c \ os/freebsd/gethostid.c \ os/freebsd/getmntany.c \ - os/freebsd/mnttab.c + os/freebsd/mnttab.c \ + os/freebsd/zone.c endif libspl_la_SOURCES = \ $(USER_C) \ $(TARGET_CPU_ATOMIC_SOURCE) libspl_la_LIBADD = \ libspl_assert.la libspl_la_LIBADD += $(LIBCLOCK_GETTIME) Index: vendor-sys/openzfs/dist/lib/libspl/include/limits.h =================================================================== --- vendor-sys/openzfs/dist/lib/libspl/include/limits.h (revision 365339) +++ vendor-sys/openzfs/dist/lib/libspl/include/limits.h (revision 365340) @@ -1,40 +1,45 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include_next +#include #ifndef _LIBSPL_LIMITS_H #define _LIBSPL_LIMITS_H +#ifndef DBL_DIG #define DBL_DIG 15 #define DBL_MAX 1.7976931348623157081452E+308 #define DBL_MIN 2.2250738585072013830903E-308 +#endif +#ifndef FLT_DIG #define FLT_DIG 6 #define FLT_MAX 3.4028234663852885981170E+38F #define FLT_MIN 1.1754943508222875079688E-38F +#endif #endif /* _LIBSPL_LIMITS_H */ Index: vendor-sys/openzfs/dist/lib/libspl/include/zone.h =================================================================== --- vendor-sys/openzfs/dist/lib/libspl/include/zone.h (revision 365339) +++ vendor-sys/openzfs/dist/lib/libspl/include/zone.h (revision 365340) @@ -1,53 +1,44 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _LIBSPL_ZONE_H #define _LIBSPL_ZONE_H - - #include #include -#include #ifdef __cplusplus extern "C" { #endif #define GLOBAL_ZONEID 0 -#define GLOBAL_ZONEID_NAME "global" -/* - * Functions for mapping between id and name for active zones. - */ extern zoneid_t getzoneid(void); -extern zoneid_t getzoneidbyname(const char *); -extern ssize_t getzonenamebyid(zoneid_t, char *, size_t); #ifdef __cplusplus } #endif #endif /* _LIBSPL_ZONE_H */ Index: vendor-sys/openzfs/dist/lib/libspl/os/freebsd/zone.c =================================================================== --- vendor-sys/openzfs/dist/lib/libspl/os/freebsd/zone.c (nonexistent) +++ vendor-sys/openzfs/dist/lib/libspl/os/freebsd/zone.c (revision 365340) @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include + +zoneid_t +getzoneid(void) +{ + size_t size; + int jailid; + + /* Information that we are in jail or not is enough for our needs. */ + size = sizeof (jailid); + if (sysctlbyname("security.jail.jailed", &jailid, &size, NULL, 0) == -1) + assert(!"No security.jail.jailed sysctl!"); + return ((zoneid_t)jailid); +} Property changes on: vendor-sys/openzfs/dist/lib/libspl/os/freebsd/zone.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: vendor-sys/openzfs/dist/lib/libspl/zone.c =================================================================== --- vendor-sys/openzfs/dist/lib/libspl/zone.c (revision 365339) +++ vendor-sys/openzfs/dist/lib/libspl/zone.c (revision 365340) @@ -1,63 +1,32 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Ricardo Correia. All rights reserved. * Use is subject to license terms. */ #include -#include -#include zoneid_t getzoneid() { return (GLOBAL_ZONEID); -} - -zoneid_t -getzoneidbyname(const char *name) -{ - if (name == NULL) - return (GLOBAL_ZONEID); - - if (strcmp(name, GLOBAL_ZONEID_NAME) == 0) - return (GLOBAL_ZONEID); - - return (EINVAL); -} - -ssize_t -getzonenamebyid(zoneid_t id, char *buf, size_t buflen) -{ - if (id != GLOBAL_ZONEID) - return (EINVAL); - - ssize_t ret = strlen(GLOBAL_ZONEID_NAME) + 1; - - if (buf == NULL || buflen == 0) - return (ret); - - strncpy(buf, GLOBAL_ZONEID_NAME, buflen); - buf[buflen - 1] = '\0'; - - return (ret); } Index: vendor-sys/openzfs/dist/lib/libzfs/libzfs_changelist.c =================================================================== --- vendor-sys/openzfs/dist/lib/libzfs/libzfs_changelist.c (revision 365339) +++ vendor-sys/openzfs/dist/lib/libzfs/libzfs_changelist.c (revision 365340) @@ -1,781 +1,785 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Portions Copyright 2007 Ramprakash Jelari * Copyright (c) 2014, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. */ #include #include #include #include #include #include #include #include #include "libzfs_impl.h" /* * Structure to keep track of dataset state. Before changing the 'sharenfs' or * 'mountpoint' property, we record whether the filesystem was previously * mounted/shared. This prior state dictates whether we remount/reshare the * dataset after the property has been changed. * * The interface consists of the following sequence of functions: * * changelist_gather() * changelist_prefix() * < change property > * changelist_postfix() * changelist_free() * * Other interfaces: * * changelist_remove() - remove a node from a gathered list * changelist_rename() - renames all datasets appropriately when doing a rename * changelist_unshare() - unshares all the nodes in a given changelist * changelist_haszonedchild() - check if there is any child exported to * a local zone */ typedef struct prop_changenode { zfs_handle_t *cn_handle; int cn_shared; int cn_mounted; int cn_zoned; boolean_t cn_needpost; /* is postfix() needed? */ uu_avl_node_t cn_treenode; } prop_changenode_t; struct prop_changelist { zfs_prop_t cl_prop; zfs_prop_t cl_realprop; zfs_prop_t cl_shareprop; /* used with sharenfs/sharesmb */ uu_avl_pool_t *cl_pool; uu_avl_t *cl_tree; boolean_t cl_waslegacy; boolean_t cl_allchildren; boolean_t cl_alldependents; int cl_mflags; /* Mount flags */ int cl_gflags; /* Gather request flags */ boolean_t cl_haszonedchild; }; /* * If the property is 'mountpoint', go through and unmount filesystems as * necessary. We don't do the same for 'sharenfs', because we can just re-share * with different options without interrupting service. We do handle 'sharesmb' * since there may be old resource names that need to be removed. */ int changelist_prefix(prop_changelist_t *clp) { prop_changenode_t *cn; uu_avl_walk_t *walk; int ret = 0; boolean_t commit_smb_shares = B_FALSE; if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && clp->cl_prop != ZFS_PROP_SHARESMB) return (0); if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) return (-1); while ((cn = uu_avl_walk_next(walk)) != NULL) { /* if a previous loop failed, set the remaining to false */ if (ret == -1) { cn->cn_needpost = B_FALSE; continue; } /* * If we are in the global zone, but this dataset is exported * to a local zone, do nothing. */ if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) continue; if (!ZFS_IS_VOLUME(cn->cn_handle)) { /* * Do the property specific processing. */ switch (clp->cl_prop) { case ZFS_PROP_MOUNTPOINT: + if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) + break; if (zfs_unmount(cn->cn_handle, NULL, clp->cl_mflags) != 0) { ret = -1; cn->cn_needpost = B_FALSE; } break; case ZFS_PROP_SHARESMB: (void) zfs_unshare_smb(cn->cn_handle, NULL); commit_smb_shares = B_TRUE; break; default: break; } } } if (commit_smb_shares) zfs_commit_smb_shares(); uu_avl_walk_end(walk); if (ret == -1) (void) changelist_postfix(clp); return (ret); } /* * If the property is 'mountpoint' or 'sharenfs', go through and remount and/or * reshare the filesystems as necessary. In changelist_gather() we recorded * whether the filesystem was previously shared or mounted. The action we take * depends on the previous state, and whether the value was previously 'legacy'. * For non-legacy properties, we only remount/reshare the filesystem if it was * previously mounted/shared. Otherwise, we always remount/reshare the * filesystem. */ int changelist_postfix(prop_changelist_t *clp) { prop_changenode_t *cn; uu_avl_walk_t *walk; char shareopts[ZFS_MAXPROPLEN]; int errors = 0; boolean_t commit_smb_shares = B_FALSE; boolean_t commit_nfs_shares = B_FALSE; /* * If we're changing the mountpoint, attempt to destroy the underlying * mountpoint. All other datasets will have inherited from this dataset * (in which case their mountpoints exist in the filesystem in the new * location), or have explicit mountpoints set (in which case they won't * be in the changelist). */ if ((cn = uu_avl_last(clp->cl_tree)) == NULL) return (0); - if (clp->cl_prop == ZFS_PROP_MOUNTPOINT) + if (clp->cl_prop == ZFS_PROP_MOUNTPOINT && + !(clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)) remove_mountpoint(cn->cn_handle); /* * We walk the datasets in reverse, because we want to mount any parent * datasets before mounting the children. We walk all datasets even if * there are errors. */ if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) return (-1); while ((cn = uu_avl_walk_next(walk)) != NULL) { boolean_t sharenfs; boolean_t sharesmb; boolean_t mounted; boolean_t needs_key; /* * If we are in the global zone, but this dataset is exported * to a local zone, do nothing. */ if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) continue; /* Only do post-processing if it's required */ if (!cn->cn_needpost) continue; cn->cn_needpost = B_FALSE; zfs_refresh_properties(cn->cn_handle); if (ZFS_IS_VOLUME(cn->cn_handle)) continue; /* * Remount if previously mounted or mountpoint was legacy, * or sharenfs or sharesmb property is set. */ sharenfs = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); sharesmb = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARESMB, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); needs_key = (zfs_prop_get_int(cn->cn_handle, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE); - mounted = zfs_is_mounted(cn->cn_handle, NULL); + mounted = (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) || + zfs_is_mounted(cn->cn_handle, NULL); if (!mounted && !needs_key && (cn->cn_mounted || ((sharenfs || sharesmb || clp->cl_waslegacy) && (zfs_prop_get_int(cn->cn_handle, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { if (zfs_mount(cn->cn_handle, NULL, 0) != 0) errors++; else mounted = TRUE; } /* * If the file system is mounted we always re-share even * if the filesystem is currently shared, so that we can * adopt any new options. */ if (sharenfs && mounted) { errors += zfs_share_nfs(cn->cn_handle); commit_nfs_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { errors += zfs_unshare_nfs(cn->cn_handle, NULL); commit_nfs_shares = B_TRUE; } if (sharesmb && mounted) { errors += zfs_share_smb(cn->cn_handle); commit_smb_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { errors += zfs_unshare_smb(cn->cn_handle, NULL); commit_smb_shares = B_TRUE; } } if (commit_nfs_shares) zfs_commit_nfs_shares(); if (commit_smb_shares) zfs_commit_smb_shares(); uu_avl_walk_end(walk); return (errors ? -1 : 0); } /* * Is this "dataset" a child of "parent"? */ boolean_t isa_child_of(const char *dataset, const char *parent) { int len; len = strlen(parent); if (strncmp(dataset, parent, len) == 0 && (dataset[len] == '@' || dataset[len] == '/' || dataset[len] == '\0')) return (B_TRUE); else return (B_FALSE); } /* * If we rename a filesystem, child filesystem handles are no longer valid * since we identify each dataset by its name in the ZFS namespace. As a * result, we have to go through and fix up all the names appropriately. We * could do this automatically if libzfs kept track of all open handles, but * this is a lot less work. */ void changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) { prop_changenode_t *cn; uu_avl_walk_t *walk; char newname[ZFS_MAX_DATASET_NAME_LEN]; if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) return; while ((cn = uu_avl_walk_next(walk)) != NULL) { /* * Do not rename a clone that's not in the source hierarchy. */ if (!isa_child_of(cn->cn_handle->zfs_name, src)) continue; /* * Destroy the previous mountpoint if needed. */ remove_mountpoint(cn->cn_handle); (void) strlcpy(newname, dst, sizeof (newname)); (void) strlcat(newname, cn->cn_handle->zfs_name + strlen(src), sizeof (newname)); (void) strlcpy(cn->cn_handle->zfs_name, newname, sizeof (cn->cn_handle->zfs_name)); } uu_avl_walk_end(walk); } /* * Given a gathered changelist for the 'sharenfs' or 'sharesmb' property, * unshare all the datasets in the list. */ int changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) { prop_changenode_t *cn; uu_avl_walk_t *walk; int ret = 0; if (clp->cl_prop != ZFS_PROP_SHARENFS && clp->cl_prop != ZFS_PROP_SHARESMB) return (0); if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) return (-1); while ((cn = uu_avl_walk_next(walk)) != NULL) { if (zfs_unshare_proto(cn->cn_handle, NULL, proto) != 0) ret = -1; } zfs_commit_proto(proto); uu_avl_walk_end(walk); return (ret); } /* * Check if there is any child exported to a local zone in a given changelist. * This information has already been recorded while gathering the changelist * via changelist_gather(). */ int changelist_haszonedchild(prop_changelist_t *clp) { return (clp->cl_haszonedchild); } /* * Remove a node from a gathered list. */ void changelist_remove(prop_changelist_t *clp, const char *name) { prop_changenode_t *cn; uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) return; while ((cn = uu_avl_walk_next(walk)) != NULL) { if (strcmp(cn->cn_handle->zfs_name, name) == 0) { uu_avl_remove(clp->cl_tree, cn); zfs_close(cn->cn_handle); free(cn); uu_avl_walk_end(walk); return; } } uu_avl_walk_end(walk); } /* * Release any memory associated with a changelist. */ void changelist_free(prop_changelist_t *clp) { prop_changenode_t *cn; if (clp->cl_tree) { uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) return; while ((cn = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(clp->cl_tree, cn); zfs_close(cn->cn_handle); free(cn); } uu_avl_walk_end(walk); uu_avl_destroy(clp->cl_tree); } if (clp->cl_pool) uu_avl_pool_destroy(clp->cl_pool); free(clp); } /* * Add one dataset to changelist */ static int changelist_add_mounted(zfs_handle_t *zhp, void *data) { prop_changelist_t *clp = data; prop_changenode_t *cn; uu_avl_index_t idx; ASSERT3U(clp->cl_prop, ==, ZFS_PROP_MOUNTPOINT); if ((cn = zfs_alloc(zfs_get_handle(zhp), sizeof (prop_changenode_t))) == NULL) { zfs_close(zhp); return (ENOMEM); } cn->cn_handle = zhp; cn->cn_mounted = zfs_is_mounted(zhp, NULL); ASSERT3U(cn->cn_mounted, ==, B_TRUE); cn->cn_shared = zfs_is_shared(zhp); cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); cn->cn_needpost = B_TRUE; /* Indicate if any child is exported to a local zone. */ if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) clp->cl_haszonedchild = B_TRUE; uu_avl_node_init(cn, &cn->cn_treenode, clp->cl_pool); if (uu_avl_find(clp->cl_tree, cn, NULL, &idx) == NULL) { uu_avl_insert(clp->cl_tree, cn, idx); } else { free(cn); zfs_close(zhp); } return (0); } static int change_one(zfs_handle_t *zhp, void *data) { prop_changelist_t *clp = data; char property[ZFS_MAXPROPLEN]; char where[64]; prop_changenode_t *cn = NULL; zprop_source_t sourcetype = ZPROP_SRC_NONE; zprop_source_t share_sourcetype = ZPROP_SRC_NONE; int ret = 0; /* * We only want to unmount/unshare those filesystems that may inherit * from the target filesystem. If we find any filesystem with a * locally set mountpoint, we ignore any children since changing the * property will not affect them. If this is a rename, we iterate * over all children regardless, since we need them unmounted in * order to do the rename. Also, if this is a volume and we're doing * a rename, then always add it to the changelist. */ if (!(ZFS_IS_VOLUME(zhp) && clp->cl_realprop == ZFS_PROP_NAME) && zfs_prop_get(zhp, clp->cl_prop, property, sizeof (property), &sourcetype, where, sizeof (where), B_FALSE) != 0) { goto out; } /* * If we are "watching" sharenfs or sharesmb * then check out the companion property which is tracked * in cl_shareprop */ if (clp->cl_shareprop != ZPROP_INVAL && zfs_prop_get(zhp, clp->cl_shareprop, property, sizeof (property), &share_sourcetype, where, sizeof (where), B_FALSE) != 0) { goto out; } if (clp->cl_alldependents || clp->cl_allchildren || sourcetype == ZPROP_SRC_DEFAULT || sourcetype == ZPROP_SRC_INHERITED || (clp->cl_shareprop != ZPROP_INVAL && (share_sourcetype == ZPROP_SRC_DEFAULT || share_sourcetype == ZPROP_SRC_INHERITED))) { if ((cn = zfs_alloc(zfs_get_handle(zhp), sizeof (prop_changenode_t))) == NULL) { ret = -1; goto out; } cn->cn_handle = zhp; cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || zfs_is_mounted(zhp, NULL); cn->cn_shared = zfs_is_shared(zhp); cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); cn->cn_needpost = B_TRUE; /* Indicate if any child is exported to a local zone. */ if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) clp->cl_haszonedchild = B_TRUE; uu_avl_node_init(cn, &cn->cn_treenode, clp->cl_pool); uu_avl_index_t idx; if (uu_avl_find(clp->cl_tree, cn, NULL, &idx) == NULL) { uu_avl_insert(clp->cl_tree, cn, idx); } else { free(cn); cn = NULL; } if (!clp->cl_alldependents) ret = zfs_iter_children(zhp, change_one, data); /* * If we added the handle to the changelist, we will re-use it * later so return without closing it. */ if (cn != NULL) return (ret); } out: zfs_close(zhp); return (ret); } static int compare_props(const void *a, const void *b, zfs_prop_t prop) { const prop_changenode_t *ca = a; const prop_changenode_t *cb = b; char propa[MAXPATHLEN]; char propb[MAXPATHLEN]; boolean_t haspropa, haspropb; haspropa = (zfs_prop_get(ca->cn_handle, prop, propa, sizeof (propa), NULL, NULL, 0, B_FALSE) == 0); haspropb = (zfs_prop_get(cb->cn_handle, prop, propb, sizeof (propb), NULL, NULL, 0, B_FALSE) == 0); if (!haspropa && haspropb) return (-1); else if (haspropa && !haspropb) return (1); else if (!haspropa && !haspropb) return (0); else return (strcmp(propb, propa)); } /*ARGSUSED*/ static int compare_mountpoints(const void *a, const void *b, void *unused) { /* * When unsharing or unmounting filesystems, we need to do it in * mountpoint order. This allows the user to have a mountpoint * hierarchy that is different from the dataset hierarchy, and still * allow it to be changed. */ return (compare_props(a, b, ZFS_PROP_MOUNTPOINT)); } /*ARGSUSED*/ static int compare_dataset_names(const void *a, const void *b, void *unused) { return (compare_props(a, b, ZFS_PROP_NAME)); } /* * Given a ZFS handle and a property, construct a complete list of datasets * that need to be modified as part of this process. For anything but the * 'mountpoint' and 'sharenfs' properties, this just returns an empty list. * Otherwise, we iterate over all children and look for any datasets that * inherit the property. For each such dataset, we add it to the list and * mark whether it was shared beforehand. */ prop_changelist_t * changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, int mnt_flags) { prop_changelist_t *clp; prop_changenode_t *cn; zfs_handle_t *temp; char property[ZFS_MAXPROPLEN]; boolean_t legacy = B_FALSE; if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL) return (NULL); /* * For mountpoint-related tasks, we want to sort everything by * mountpoint, so that we mount and unmount them in the appropriate * order, regardless of their position in the hierarchy. */ if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED || prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && (strcmp(property, "legacy") == 0 || strcmp(property, "none") == 0)) { legacy = B_TRUE; } } clp->cl_pool = uu_avl_pool_create("changelist_pool", sizeof (prop_changenode_t), offsetof(prop_changenode_t, cn_treenode), legacy ? compare_dataset_names : compare_mountpoints, 0); if (clp->cl_pool == NULL) { assert(uu_error() == UU_ERROR_NO_MEMORY); (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); changelist_free(clp); return (NULL); } clp->cl_tree = uu_avl_create(clp->cl_pool, NULL, UU_DEFAULT); clp->cl_gflags = gather_flags; clp->cl_mflags = mnt_flags; if (clp->cl_tree == NULL) { assert(uu_error() == UU_ERROR_NO_MEMORY); (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); changelist_free(clp); return (NULL); } /* * If this is a rename or the 'zoned' property, we pretend we're * changing the mountpoint and flag it so we can catch all children in * change_one(). * * Flag cl_alldependents to catch all children plus the dependents * (clones) that are not in the hierarchy. */ if (prop == ZFS_PROP_NAME) { clp->cl_prop = ZFS_PROP_MOUNTPOINT; clp->cl_alldependents = B_TRUE; } else if (prop == ZFS_PROP_ZONED) { clp->cl_prop = ZFS_PROP_MOUNTPOINT; clp->cl_allchildren = B_TRUE; } else if (prop == ZFS_PROP_CANMOUNT) { clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else if (prop == ZFS_PROP_VOLSIZE) { clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else { clp->cl_prop = prop; } clp->cl_realprop = prop; if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && clp->cl_prop != ZFS_PROP_SHARENFS && clp->cl_prop != ZFS_PROP_SHARESMB) return (clp); /* * If watching SHARENFS or SHARESMB then * also watch its companion property. */ if (clp->cl_prop == ZFS_PROP_SHARENFS) clp->cl_shareprop = ZFS_PROP_SHARESMB; else if (clp->cl_prop == ZFS_PROP_SHARESMB) clp->cl_shareprop = ZFS_PROP_SHARENFS; if (clp->cl_prop == ZFS_PROP_MOUNTPOINT && (clp->cl_gflags & CL_GATHER_ITER_MOUNTED)) { /* * Instead of iterating through all of the dataset children we * gather mounted dataset children from MNTTAB */ if (zfs_iter_mounted(zhp, changelist_add_mounted, clp) != 0) { changelist_free(clp); return (NULL); } } else if (clp->cl_alldependents) { if (zfs_iter_dependents(zhp, B_TRUE, change_one, clp) != 0) { changelist_free(clp); return (NULL); } } else if (zfs_iter_children(zhp, change_one, clp) != 0) { changelist_free(clp); return (NULL); } /* * We have to re-open ourselves because we auto-close all the handles * and can't tell the difference. */ if ((temp = zfs_open(zhp->zfs_hdl, zfs_get_name(zhp), ZFS_TYPE_DATASET)) == NULL) { changelist_free(clp); return (NULL); } /* * Always add ourself to the list. We add ourselves to the end so that * we're the last to be unmounted. */ if ((cn = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changenode_t))) == NULL) { zfs_close(temp); changelist_free(clp); return (NULL); } cn->cn_handle = temp; cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || zfs_is_mounted(temp, NULL); cn->cn_shared = zfs_is_shared(temp); cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); cn->cn_needpost = B_TRUE; uu_avl_node_init(cn, &cn->cn_treenode, clp->cl_pool); uu_avl_index_t idx; if (uu_avl_find(clp->cl_tree, cn, NULL, &idx) == NULL) { uu_avl_insert(clp->cl_tree, cn, idx); } else { free(cn); zfs_close(temp); } /* * If the mountpoint property was previously 'legacy', or 'none', * record it as the behavior of changelist_postfix() will be different. */ if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) { /* * do not automatically mount ex-legacy datasets if * we specifically set canmount to noauto */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) != ZFS_CANMOUNT_NOAUTO) clp->cl_waslegacy = B_TRUE; } return (clp); } Index: vendor-sys/openzfs/dist/lib/libzfs/libzfs_dataset.c =================================================================== --- vendor-sys/openzfs/dist/lib/libzfs/libzfs_dataset.c (revision 365339) +++ vendor-sys/openzfs/dist/lib/libzfs/libzfs_dataset.c (revision 365340) @@ -1,5492 +1,5506 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov * Copyright 2017-2018 RackTop Systems. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #include #endif /* HAVE_IDMAP */ #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "libzfs.h" #include "zfs_deleg.h" static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human * readable form. */ const char * zfs_type_to_name(zfs_type_t type) { switch (type) { case ZFS_TYPE_FILESYSTEM: return (dgettext(TEXT_DOMAIN, "filesystem")); case ZFS_TYPE_SNAPSHOT: return (dgettext(TEXT_DOMAIN, "snapshot")); case ZFS_TYPE_VOLUME: return (dgettext(TEXT_DOMAIN, "volume")); case ZFS_TYPE_POOL: return (dgettext(TEXT_DOMAIN, "pool")); case ZFS_TYPE_BOOKMARK: return (dgettext(TEXT_DOMAIN, "bookmark")); default: assert(!"unhandled zfs_type_t"); } return (NULL); } /* * Validate a ZFS path. This is used even before trying to open the dataset, to * provide a more meaningful error message. We call zfs_error_aux() to * explain exactly why the name was not valid. */ int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying) { namecheck_err_t why; char what; if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot delimiter '@' is not expected here")); return (0); } if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '@' delimiter in snapshot name")); return (0); } if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bookmark delimiter '#' is not expected here")); return (0); } if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '#' delimiter in bookmark name")); return (0); } if (modifying && strchr(path, '%') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character %c in name"), '%'); return (0); } if (entity_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component or misplaced '@'" " or '#' delimiter in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in name"), what); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool doesn't begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "reserved disk name")); break; case NAME_ERR_SELF_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "self reference, '.' is found in name")); break; case NAME_ERR_PARENT_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent reference, '..' is found in name")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (0); } return (-1); } int zfs_name_valid(const char *name, zfs_type_t type) { if (type == ZFS_TYPE_POOL) return (zpool_name_valid(NULL, B_FALSE, name)); return (zfs_validate_name(NULL, name, type, B_FALSE)); } /* * This function takes the raw DSL properties, and filters out the user-defined * properties into a separate nvlist. */ static nvlist_t * process_user_props(zfs_handle_t *zhp, nvlist_t *props) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvpair_t *elem; nvlist_t *propval; nvlist_t *nvl; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { if (!zfs_prop_user(nvpair_name(elem))) continue; verify(nvpair_value_nvlist(elem, &propval) == 0); if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { nvlist_free(nvl); (void) no_memory(hdl); return (NULL); } } return (nvl); } static zpool_handle_t * zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph; if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { if (hdl->libzfs_pool_handles != NULL) zph->zpool_next = hdl->libzfs_pool_handles; hdl->libzfs_pool_handles = zph; } return (zph); } static zpool_handle_t * zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph = hdl->libzfs_pool_handles; while ((zph != NULL) && (strncmp(pool_name, zpool_get_name(zph), len) != 0)) zph = zph->zpool_next; return (zph); } /* * Returns a handle to the pool that contains the provided dataset. * If a handle to that pool already exists then that handle is returned. * Otherwise, a new handle is created and added to the list of handles. */ static zpool_handle_t * zpool_handle(zfs_handle_t *zhp) { char *pool_name; int len; zpool_handle_t *zph; len = strcspn(zhp->zfs_name, "/@#") + 1; pool_name = zfs_alloc(zhp->zfs_hdl, len); (void) strlcpy(pool_name, zhp->zfs_name, len); zph = zpool_find_handle(zhp, pool_name, len); if (zph == NULL) zph = zpool_add_handle(zhp, pool_name); free(pool_name); return (zph); } void zpool_free_handles(libzfs_handle_t *hdl) { zpool_handle_t *next, *zph = hdl->libzfs_pool_handles; while (zph != NULL) { next = zph->zpool_next; zpool_close(zph); zph = next; } hdl->libzfs_pool_handles = NULL; } /* * Utility function to gather stats (objset and zpl) for the given object. */ static int get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, zc) != 0) { return (-1); } } else { return (-1); } } return (0); } /* * Utility function to get the received properties of the given object. */ static int get_recvd_props_ioctl(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *recvdprops; zfs_cmd_t zc = {"\0"}; int err; if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { return (-1); } } else { zcmd_free_nvlists(&zc); return (-1); } } err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); zcmd_free_nvlists(&zc); if (err != 0) return (-1); nvlist_free(zhp->zfs_recvd_props); zhp->zfs_recvd_props = recvdprops; return (0); } static int put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) { nvlist_t *allprops, *userprops; zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { return (-1); } /* * XXX Why do we store the user props separately, in addition to * storing them in zfs_props? */ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); } nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); zhp->zfs_props = allprops; zhp->zfs_user_props = userprops; return (0); } static int get_stats(zfs_handle_t *zhp) { int rc = 0; zfs_cmd_t zc = {"\0"}; if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) return (-1); if (get_stats_ioctl(zhp, &zc) != 0) rc = -1; else if (put_stats_zhdl(zhp, &zc) != 0) rc = -1; zcmd_free_nvlists(&zc); return (rc); } /* * Refresh the properties currently stored in the handle. */ void zfs_refresh_properties(zfs_handle_t *zhp) { (void) get_stats(zhp); } /* * Makes a handle from the given dataset name. Used by zfs_open() and * zfs_iter_* to create child handles on the fly. */ static int make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) { if (put_stats_zhdl(zhp, zc) != 0) return (-1); /* * We've managed to open the dataset and gather statistics. Determine * the high-level type. */ if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_head_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM; else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) return (-1); else abort(); if (zhp->zfs_dmustats.dds_is_snapshot) zhp->zfs_type = ZFS_TYPE_SNAPSHOT; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_type = ZFS_TYPE_FILESYSTEM; else abort(); /* we should never see any other types */ if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) return (-1); return (0); } zfs_handle_t * make_dataset_handle(libzfs_handle_t *hdl, const char *path) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { free(zhp); return (NULL); } if (get_stats_ioctl(zhp, &zc) == -1) { zcmd_free_nvlists(&zc); free(zhp); return (NULL); } if (make_dataset_handle_common(zhp, &zc) == -1) { free(zhp); zhp = NULL; } zcmd_free_nvlists(&zc); return (zhp); } zfs_handle_t * make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); if (make_dataset_handle_common(zhp, zc) == -1) { free(zhp); return (NULL); } return (zhp); } zfs_handle_t * make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = pzhp->zfs_hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); zhp->zfs_head_type = pzhp->zfs_type; zhp->zfs_type = ZFS_TYPE_SNAPSHOT; zhp->zpool_hdl = zpool_handle(zhp); return (zhp); } zfs_handle_t * zfs_handle_dup(zfs_handle_t *zhp_orig) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = zhp_orig->zfs_hdl; zhp->zpool_hdl = zhp_orig->zpool_hdl; (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, sizeof (zhp->zfs_name)); zhp->zfs_type = zhp_orig->zfs_type; zhp->zfs_head_type = zhp_orig->zfs_head_type; zhp->zfs_dmustats = zhp_orig->zfs_dmustats; if (zhp_orig->zfs_props != NULL) { if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_user_props != NULL) { if (nvlist_dup(zhp_orig->zfs_user_props, &zhp->zfs_user_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_recvd_props != NULL) { if (nvlist_dup(zhp_orig->zfs_recvd_props, &zhp->zfs_recvd_props, 0)) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; if (zhp_orig->zfs_mntopts != NULL) { zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, zhp_orig->zfs_mntopts); } zhp->zfs_props_table = zhp_orig->zfs_props_table; return (zhp); } boolean_t zfs_bookmark_exists(const char *path) { nvlist_t *bmarks; nvlist_t *props; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmark_name; char *pound; int err; boolean_t rv; (void) strlcpy(fsname, path, sizeof (fsname)); pound = strchr(fsname, '#'); if (pound == NULL) return (B_FALSE); *pound = '\0'; bmark_name = pound + 1; props = fnvlist_alloc(); err = lzc_get_bookmarks(fsname, props, &bmarks); nvlist_free(props); if (err != 0) { nvlist_free(bmarks); return (B_FALSE); } rv = nvlist_exists(bmarks, bmark_name); nvlist_free(bmarks); return (rv); } zfs_handle_t * make_bookmark_handle(zfs_handle_t *parent, const char *path, nvlist_t *bmark_props) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); /* Fill in the name. */ zhp->zfs_hdl = parent->zfs_hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); /* Set the property lists. */ if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { free(zhp); return (NULL); } /* Set the types. */ zhp->zfs_head_type = parent->zfs_head_type; zhp->zfs_type = ZFS_TYPE_BOOKMARK; if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { nvlist_free(zhp->zfs_props); free(zhp); return (NULL); } return (zhp); } struct zfs_open_bookmarks_cb_data { const char *path; zfs_handle_t *zhp; }; static int zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data) { struct zfs_open_bookmarks_cb_data *dp = data; /* * Is it the one we are looking for? */ if (strcmp(dp->path, zfs_get_name(zhp)) == 0) { /* * We found it. Save it and let the caller know we are done. */ dp->zhp = zhp; return (EEXIST); } /* * Not found. Close the handle and ask for another one. */ zfs_close(zhp); return (0); } /* * Opens the given snapshot, bookmark, filesystem, or volume. The 'types' * argument is a mask of acceptable types. The function will print an * appropriate error message and return NULL if it can't be opened. */ zfs_handle_t * zfs_open(libzfs_handle_t *hdl, const char *path, int types) { zfs_handle_t *zhp; char errbuf[1024]; char *bookp; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); /* * Validate the name before we even try to open it. */ if (!zfs_validate_name(hdl, path, types, B_FALSE)) { (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); return (NULL); } /* * Bookmarks needs to be handled separately. */ bookp = strchr(path, '#'); if (bookp == NULL) { /* * Try to get stats for the dataset, which will tell us if it * exists. */ errno = 0; if ((zhp = make_dataset_handle(hdl, path)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } } else { char dsname[ZFS_MAX_DATASET_NAME_LEN]; zfs_handle_t *pzhp; struct zfs_open_bookmarks_cb_data cb_data = {path, NULL}; /* * We need to cut out '#' and everything after '#' * to get the parent dataset name only. */ assert(bookp - path < sizeof (dsname)); (void) strncpy(dsname, path, bookp - path); dsname[bookp - path] = '\0'; /* * Create handle for the parent dataset. */ errno = 0; if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } /* * Iterate bookmarks to find the right one. */ errno = 0; if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb, &cb_data) == 0) && (cb_data.zhp == NULL)) { (void) zfs_error(hdl, EZFS_NOENT, errbuf); zfs_close(pzhp); return (NULL); } if (cb_data.zhp == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); zfs_close(pzhp); return (NULL); } zhp = cb_data.zhp; /* * Cleanup. */ zfs_close(pzhp); } if (!(types & zhp->zfs_type)) { (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (NULL); } return (zhp); } /* * Release a ZFS handle. Nothing to do but free the associated memory. */ void zfs_close(zfs_handle_t *zhp) { if (zhp->zfs_mntopts) free(zhp->zfs_mntopts); nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); nvlist_free(zhp->zfs_recvd_props); free(zhp); } typedef struct mnttab_node { struct mnttab mtn_mt; avl_node_t mtn_node; } mnttab_node_t; static int libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) { const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1; const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2; int rv; rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); return (TREE_ISIGN(rv)); } void libzfs_mnttab_init(libzfs_handle_t *hdl) { pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); } static int libzfs_mnttab_update(libzfs_handle_t *hdl) { struct mnttab entry; /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) return (ENOENT); while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { mnttab_node_t *mtn; avl_index_t where; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); /* Exclude duplicate mounts */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); continue; } avl_add(&hdl->libzfs_mnttab_cache, mtn); } return (0); } void libzfs_mnttab_fini(libzfs_handle_t *hdl) { void *cookie = NULL; mnttab_node_t *mtn; while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } avl_destroy(&hdl->libzfs_mnttab_cache); (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) { hdl->libzfs_mnttab_enable = enable; } int libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, struct mnttab *entry) { mnttab_node_t find; mnttab_node_t *mtn; int ret = ENOENT; if (!hdl->libzfs_mnttab_enable) { struct mnttab srch = { 0 }; if (avl_numnodes(&hdl->libzfs_mnttab_cache)) libzfs_mnttab_fini(hdl); /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) return (ENOENT); srch.mnt_special = (char *)fsname; srch.mnt_fstype = MNTTYPE_ZFS; if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) return (0); else return (ENOENT); } pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) { int error; if ((error = libzfs_mnttab_update(hdl)) != 0) { pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (error); } } find.mtn_mt.mnt_special = (char *)fsname; mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); if (mtn) { *entry = mtn->mtn_mt; ret = 0; } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (ret); } void libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, const char *mountp, const char *mntopts) { mnttab_node_t *mtn; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) { mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); /* * Another thread may have already added this entry * via libzfs_mnttab_update. If so we should skip it. */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } else { avl_add(&hdl->libzfs_mnttab_cache, mtn); } } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) { mnttab_node_t find; mnttab_node_t *ret; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); find.mtn_mt.mnt_special = (char *)fsname; if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) != NULL) { avl_remove(&hdl->libzfs_mnttab_cache, ret); free(ret->mtn_mt.mnt_special); free(ret->mtn_mt.mnt_mountp); free(ret->mtn_mt.mnt_fstype); free(ret->mtn_mt.mnt_mntopts); free(ret); } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } int zfs_spa_version(zfs_handle_t *zhp, int *spa_version) { zpool_handle_t *zpool_handle = zhp->zpool_hdl; if (zpool_handle == NULL) return (-1); *spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); return (0); } /* * The choice of reservation property depends on the SPA version. */ static int zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) { int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); if (spa_version >= SPA_VERSION_REFRESERVATION) *resv_prop = ZFS_PROP_REFRESERVATION; else *resv_prop = ZFS_PROP_RESERVATION; return (0); } /* * Given an nvlist of properties to set, validates that they are correct, and * parses any numeric properties (index, boolean, etc) if they are specified as * strings. */ nvlist_t * zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, boolean_t key_params_ok, const char *errbuf) { nvpair_t *elem; uint64_t intval; char *strval; zfs_prop_t prop; nvlist_t *ret; int chosen_normal = -1; int chosen_utf = -1; if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } /* * Make sure this property is valid and applies to this type. */ elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zfs_name_to_prop(propname); if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { /* * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property name '%s' is too long"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (nvlist_add_string(ret, propname, strval) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Currently, only user properties can be modified on * snapshots. */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { zfs_userquota_prop_t uqtype; char *newpropname = NULL; char domain[128]; uint64_t rid; uint64_t valary[3]; int rc; if (userquota_propname_decode(propname, zoned, &uqtype, domain, sizeof (domain), &rid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' has an invalid user/group name"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (uqtype != ZFS_PROP_USERQUOTA && uqtype != ZFS_PROP_GROUPQUOTA && uqtype != ZFS_PROP_USEROBJQUOTA && uqtype != ZFS_PROP_GROUPOBJQUOTA && uqtype != ZFS_PROP_PROJECTQUOTA && uqtype != ZFS_PROP_PROJECTOBJQUOTA) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (nvpair_type(elem) == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &strval); if (strcmp(strval, "none") == 0) { intval = 0; } else if (zfs_nicestrtonum(hdl, strval, &intval) != 0) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, &intval); if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable " "{user|group|project}quota")); goto error; } } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * Encode the prop name as * userquota@-domain, to make it easy * for the kernel to decode. */ rc = asprintf(&newpropname, "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], (longlong_t)rid, domain); if (rc == -1 || newpropname == NULL) { (void) no_memory(hdl); goto error; } valary[0] = uqtype; valary[1] = rid; valary[2] = intval; if (nvlist_add_uint64_array(ret, newpropname, valary, 3) != 0) { free(newpropname); (void) no_memory(hdl); goto error; } free(newpropname); continue; } else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!zfs_prop_valid_for_type(prop, type, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " "apply to datasets of this type"), propname); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (zfs_prop_readonly(prop) && !(zfs_prop_setonce(prop) && zhp == NULL) && !(zfs_prop_encryption_key_param(prop) && key_params_ok)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, type, ret, &strval, &intval, errbuf) != 0) goto error; /* * Perform some additional checks for specific properties. */ switch (prop) { case ZFS_PROP_VERSION: { int version; if (zhp == NULL) break; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (intval < version) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Can not downgrade; already at version %u"), version); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: { int maxbs = SPA_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); } /* * The value must be a power of two between * SPA_MINBLOCKSIZE and maxbs. */ if (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval)) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be power of 2 from 512B " "to %s"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: { int maxbs = SPA_OLD_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { char state[64] = ""; maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); /* * Issue a warning but do not fail so that * tests for settable properties succeed. */ if (zpool_prop_get_feature(zpool_hdl, "feature@allocation_classes", state, sizeof (state)) != 0 || strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { (void) fprintf(stderr, gettext( "%s: property requires a special " "device in the pool\n"), propname); } } if (intval != 0 && (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval))) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid '%s=%d' property: must be zero or " "a power of 2 from 512B to %s"), propname, intval, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL /* * Verify the mlslabel string and convert to * internal hex label string. */ m_label_t *new_sl; char *hex = NULL; /* internal label string */ /* Default value is already OK. */ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) break; /* Verify the label can be converted to binary form */ if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || (str_to_label(strval, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1)) { goto badlabel; } /* Now translate to hex internal label string */ if (label_to_str(new_sl, &hex, M_INTERNAL, DEF_NAMES) != 0) { if (hex) free(hex); goto badlabel; } m_label_free(new_sl); /* If string is already in internal form, we're done. */ if (strcmp(strval, hex) == 0) { free(hex); break; } /* Replace the label string with the internal form. */ (void) nvlist_remove(ret, zfs_prop_to_name(prop), DATA_TYPE_STRING); verify(nvlist_add_string(ret, zfs_prop_to_name(prop), hex) == 0); free(hex); break; badlabel: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid mlslabel '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); m_label_free(new_sl); /* OK if null */ goto error; #else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "mlslabels are unsupported")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; #endif /* HAVE_MLSLABEL */ } case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) break; if (mountpoint_namecheck(strval, &why)) { switch (why) { case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be an absolute path, " "'none', or 'legacy'"), propname); break; case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "component of '%s' is too long"), propname); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } /*FALLTHRU*/ case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* * For the mountpoint and sharenfs or sharesmb * properties, check if it can be set in a * global/non-global zone based on * the zoned property value: * * global zone non-global zone * -------------------------------------------------- * zoned=on mountpoint (no) mountpoint (yes) * sharenfs (no) sharenfs (no) * sharesmb (no) sharesmb (no) * * zoned=off mountpoint (yes) N/A * sharenfs (yes) * sharesmb (yes) */ if (zoned) { if (getzoneid() == GLOBAL_ZONEID) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set on " "dataset in a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } else if (prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set in " "a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " "'zoned' property is set"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } /* * At this point, it is legitimate to set the * property. Now we want to make sure that the * property value is valid if it is sharenfs. */ if ((prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) && strcmp(strval, "on") != 0 && strcmp(strval, "off") != 0) { zfs_share_proto_t proto; if (prop == ZFS_PROP_SHARESMB) proto = PROTO_SMB; else proto = PROTO_NFS; if (zfs_parse_options(strval, proto) != SA_OK) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set to invalid " "options"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_KEYLOCATION: if (!zfs_prop_valid_keylocation(strval, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid keylocation")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zhp != NULL) { uint64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); if (crypt == ZIO_CRYPT_OFF && strcmp(strval, "none") != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must be 'none' " "for unencrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (crypt != ZIO_CRYPT_OFF && strcmp(strval, "none") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must not be 'none' " "for encrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_PBKDF2_ITERS: if (intval < MIN_PBKDF2_ITERATIONS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "minimum pbkdf2 iterations is %u"), MIN_PBKDF2_ITERATIONS); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; case ZFS_PROP_NORMALIZE: chosen_normal = (int)intval; break; default: break; } /* * For changes to existing volumes, we have some additional * checks to enforce. */ if (type == ZFS_TYPE_VOLUME && zhp != NULL) { uint64_t blocksize = zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE); char buf[64]; switch (prop) { case ZFS_PROP_VOLSIZE: if (intval % blocksize != 0) { zfs_nicebytes(blocksize, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a multiple of " "volume block size (%s)"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be zero"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } /* check encryption properties */ if (zhp != NULL) { int64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); switch (prop) { case ZFS_PROP_COPIES: if (crypt != ZIO_CRYPT_OFF && intval > 2) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encrypted datasets cannot have " "3 copies")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } } /* * If normalization was chosen, but no UTF8 choice was made, * enforce rejection of non-UTF8 names. * * If normalization was chosen, but rejecting non-UTF8 names * was explicitly not chosen, it is an error. */ if (chosen_normal > 0 && chosen_utf < 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { (void) no_memory(hdl); goto error; } } else if (chosen_normal > 0 && chosen_utf == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be set 'on' if normalization chosen"), zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } return (ret); error: nvlist_free(ret); return (NULL); } static int zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t old_volsize; uint64_t new_volsize; uint64_t old_reservation; uint64_t new_reservation; zfs_prop_t resv_prop; nvlist_t *props; zpool_handle_t *zph = zpool_handle(zhp); /* * If this is an existing volume, and someone is setting the volsize, * make sure that it matches the reservation, or add it if necessary. */ old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_reservation = zfs_prop_get_int(zhp, resv_prop); props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if ((zvol_volsize_to_reservation(zph, old_volsize, props) != old_reservation) || nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) { fnvlist_free(props); return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &new_volsize) != 0) { fnvlist_free(props); return (-1); } new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); fnvlist_free(props); if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), new_reservation) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } /* * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value. * Return codes must match zfs_add_synthetic_resv(). */ static int zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t volsize; uint64_t resvsize; zfs_prop_t prop; nvlist_t *props; if (!ZFS_IS_VOLUME(zhp)) { return (0); } if (zfs_which_resv_prop(zhp, &prop) != 0) { return (-1); } if (prop != ZFS_PROP_REFRESERVATION) { return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) { /* No value being set, so it can't be "auto" */ return (0); } if (resvsize != UINT64_MAX) { /* Being set to a value other than "auto" */ return (0); } props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); } resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, props); fnvlist_free(props); (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * Given a property name and value, set the property for the given dataset. */ int zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) { int ret = -1; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_string(nvl, propname, propval) != 0) { (void) no_memory(hdl); goto error; } ret = zfs_prop_set_list(zhp, nvl); error: nvlist_free(nvl); return (ret); } /* * Given an nvlist of property names and values, set the properties for the * given dataset. */ int zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) { zfs_cmd_t zc = {"\0"}; int ret = -1; prop_changelist_t **cls = NULL; int cl_idx; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; zfs_prop_t prop = 0; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, B_FALSE, errbuf)) == NULL) goto error; /* * We have to check for any extra properties which need to be added * before computing the length of the nvlist. */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { goto error; } } if (added_resv != 1 && (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) { goto error; } /* * Check how many properties we're setting and allocate an array to * store changelist pointers for postfix(). */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) nvl_len++; if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) goto error; cl_idx = 0; for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); assert(cl_idx < nvl_len); /* * We don't want to unmount & remount the dataset when changing * its canmount property to 'on' or 'noauto'. We only use * the changelist logic to unmount when setting canmount=off. */ if (prop != ZFS_PROP_CANMOUNT || (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && zfs_is_mounted(zhp, NULL))) { cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); if (cls[cl_idx] == NULL) goto error; } if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cls[cl_idx])) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if (cls[cl_idx] != NULL && (ret = changelist_prefix(cls[cl_idx])) != 0) goto error; cl_idx++; } assert(cl_idx == nvl_len); /* * Execute the corresponding ioctl() to set this list of properties. */ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 || (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0) goto error; ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); if (ret != 0) { if (zc.zc_nvlist_dst_filled == B_FALSE) { (void) zfs_standard_error(hdl, errno, errbuf); goto error; } /* Get the list of unset properties back and report them. */ nvlist_t *errorprops = NULL; if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) goto error; for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL); elem != NULL; elem = nvlist_next_nvpair(errorprops, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); zfs_setprop_error(hdl, prop, errno, errbuf); } nvlist_free(errorprops); if (added_resv && errno == ENOSPC) { /* clean up the volsize property we tried to set */ uint64_t old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); nvlist_free(nvl); nvl = NULL; zcmd_free_nvlists(&zc); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) goto error; if (nvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), old_volsize) != 0) goto error; if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) goto error; (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); } } else { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) { int clp_err = changelist_postfix(cls[cl_idx]); if (clp_err != 0) ret = clp_err; } } if (ret == 0) { /* * Refresh the statistics so the new property * value is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } error: nvlist_free(nvl); zcmd_free_nvlists(&zc); if (cls != NULL) { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) changelist_free(cls[cl_idx]); } free(cls); } return (ret); } /* * Given a property, inherit the value from the parent dataset, or if received * is TRUE, revert to the received value, if any. */ int zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) { zfs_cmd_t zc = {"\0"}; int ret; prop_changelist_t *cl; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; zfs_prop_t prop; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s'"), propname, zhp->zfs_name); zc.zc_cookie = received; if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { /* * For user properties, the amount of work we have to do is very * small, so just do it here. */ if (!zfs_prop_user(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) return (zfs_standard_error(hdl, errno, errbuf)); return (0); } /* * Verify that this property is inheritable. */ if (zfs_prop_readonly(prop)) return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); if (!zfs_prop_inheritable(prop) && !received) return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); /* * Check to see if the value applies to this type */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); /* * Normalize the name, to get rid of shorthand abbreviations. */ propname = zfs_prop_to_name(prop); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Determine datasets which will be affected by this change, if any. */ if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) return (-1); if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) { return (zfs_standard_error(hdl, errno, errbuf)); } else { if ((ret = changelist_postfix(cl)) != 0) goto error; /* * Refresh the statistics so the new property is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } error: changelist_free(cl); return (ret); } /* * True DSL properties are stored in an nvlist. The following two functions * extract them appropriately. */ uint64_t getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; uint64_t value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = ""; } return (value); } static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_string(prop); *source = ""; } return (value); } static boolean_t zfs_is_recvd_props_mode(zfs_handle_t *zhp) { return (zhp->zfs_props == zhp->zfs_recvd_props); } static void zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; zhp->zfs_props = zhp->zfs_recvd_props; } static void zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; *cookie = 0; } /* * Internal function for getting a numeric property. Both zfs_prop_get() and * zfs_prop_get_int() are built using this interface. * * Certain properties can be overridden using 'mount -o'. In this case, scan * the contents of the /proc/self/mounts entry, searching for the * appropriate options. If they differ from the on-disk values, report the * current values and mark the source "temporary". */ static int get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, char **source, uint64_t *val) { zfs_cmd_t zc = {"\0"}; nvlist_t *zplprops = NULL; struct mnttab mnt; char *mntopt_on = NULL; char *mntopt_off = NULL; boolean_t received = zfs_is_recvd_props_mode(zhp); *source = NULL; /* * If the property is being fetched for a snapshot, check whether * the property is valid for the snapshot's head dataset type. */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT && !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { *val = zfs_prop_default_numeric(prop); return (-1); } switch (prop) { case ZFS_PROP_ATIME: mntopt_on = MNTOPT_ATIME; mntopt_off = MNTOPT_NOATIME; break; case ZFS_PROP_RELATIME: mntopt_on = MNTOPT_RELATIME; mntopt_off = MNTOPT_NORELATIME; break; case ZFS_PROP_DEVICES: mntopt_on = MNTOPT_DEVICES; mntopt_off = MNTOPT_NODEVICES; break; case ZFS_PROP_EXEC: mntopt_on = MNTOPT_EXEC; mntopt_off = MNTOPT_NOEXEC; break; case ZFS_PROP_READONLY: mntopt_on = MNTOPT_RO; mntopt_off = MNTOPT_RW; break; case ZFS_PROP_SETUID: mntopt_on = MNTOPT_SETUID; mntopt_off = MNTOPT_NOSETUID; break; case ZFS_PROP_XATTR: mntopt_on = MNTOPT_XATTR; mntopt_off = MNTOPT_NOXATTR; break; case ZFS_PROP_NBMAND: mntopt_on = MNTOPT_NBMAND; mntopt_off = MNTOPT_NONBMAND; break; default: break; } /* * Because looking up the mount options is potentially expensive * (iterating over all of /proc/self/mounts), we defer its * calculation until we're looking up a property which requires * its presence. */ if (!zhp->zfs_mntcheck && (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) { zhp->zfs_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); if (zhp->zfs_mntopts == NULL) return (-1); } zhp->zfs_mntcheck = B_TRUE; } if (zhp->zfs_mntopts == NULL) mnt.mnt_mntopts = ""; else mnt.mnt_mntopts = zhp->zfs_mntopts; switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_READONLY: case ZFS_PROP_SETUID: #ifndef __FreeBSD__ case ZFS_PROP_XATTR: #endif case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); if (received) break; if (hasmntopt(&mnt, mntopt_on) && !*val) { *val = B_TRUE; if (src) *src = ZPROP_SRC_TEMPORARY; } else if (hasmntopt(&mnt, mntopt_off) && *val) { *val = B_FALSE; if (src) *src = ZPROP_SRC_TEMPORARY; } break; case ZFS_PROP_CANMOUNT: case ZFS_PROP_VOLSIZE: case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { /* not default, must be local */ *source = zhp->zfs_name; } break; case ZFS_PROP_MOUNTED: *val = (zhp->zfs_mntopts != NULL); break; case ZFS_PROP_NUMCLONES: *val = zhp->zfs_dmustats.dds_num_clones; break; case ZFS_PROP_VERSION: case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: case ZFS_PROP_CASE: if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) return (-1); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); if (prop == ZFS_PROP_VERSION && zhp->zfs_type == ZFS_TYPE_VOLUME) *val = zfs_prop_default_numeric(prop); return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); return (-1); } nvlist_free(zplprops); zcmd_free_nvlists(&zc); break; case ZFS_PROP_INCONSISTENT: *val = zhp->zfs_dmustats.dds_inconsistent; break; case ZFS_PROP_REDACTED: *val = zhp->zfs_dmustats.dds_redacted; break; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* * If we tried to use a default value for a * readonly property, it means that it was not * present. Note this only applies to "truly" * readonly properties, not set-once properties * like volblocksize. */ if (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop) && *source != NULL && (*source)[0] == '\0') { *source = NULL; return (-1); } break; case PROP_TYPE_STRING: default: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "cannot get non-numeric property")); return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "internal error"))); } } return (0); } /* * Calculate the source type, given the raw source string. */ static void get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, char *statbuf, size_t statlen) { if (statbuf == NULL || srctype == NULL || *srctype == ZPROP_SRC_TEMPORARY) { return; } if (source == NULL) { *srctype = ZPROP_SRC_NONE; } else if (source[0] == '\0') { *srctype = ZPROP_SRC_DEFAULT; } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { *srctype = ZPROP_SRC_RECEIVED; } else { if (strcmp(source, zhp->zfs_name) == 0) { *srctype = ZPROP_SRC_LOCAL; } else { (void) strlcpy(statbuf, source, statlen); *srctype = ZPROP_SRC_INHERITED; } } } int zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, size_t proplen, boolean_t literal) { zfs_prop_t prop; int err = 0; if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (-1); prop = zfs_name_to_prop(propname); if (prop != ZPROP_INVAL) { uint64_t cookie; if (!nvlist_exists(zhp->zfs_recvd_props, propname)) return (-1); zfs_set_recvd_props_mode(zhp, &cookie); err = zfs_prop_get(zhp, prop, propbuf, proplen, NULL, NULL, 0, literal); zfs_unset_recvd_props_mode(zhp, &cookie); } else { nvlist_t *propval; char *recvdval; if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, propname, &propval) != 0) return (-1); verify(nvlist_lookup_string(propval, ZPROP_VALUE, &recvdval) == 0); (void) strlcpy(propbuf, recvdval, proplen); } return (err == 0 ? 0 : -1); } static int get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; nvpair_t *pair; value = zfs_get_clones_nvl(zhp); if (value == NULL) return (-1); propbuf[0] = '\0'; for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; pair = nvlist_next_nvpair(value, pair)) { if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) strlcat(propbuf, nvpair_name(pair), proplen); } return (0); } struct get_clones_arg { uint64_t numclones; nvlist_t *value; const char *origin; char buf[ZFS_MAX_DATASET_NAME_LEN]; }; static int get_clones_cb(zfs_handle_t *zhp, void *arg) { struct get_clones_arg *gca = arg; if (gca->numclones == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), NULL, NULL, 0, B_TRUE) != 0) goto out; if (strcmp(gca->buf, gca->origin) == 0) { fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); gca->numclones--; } out: (void) zfs_iter_children(zhp, get_clones_cb, gca); zfs_close(zhp); return (0); } nvlist_t * zfs_get_clones_nvl(zfs_handle_t *zhp) { nvlist_t *nv, *value; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { struct get_clones_arg gca; /* * if this is a snapshot, then the kernel wasn't able * to get the clones. Do it by slowly iterating. */ if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) return (NULL); if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) return (NULL); if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { nvlist_free(nv); return (NULL); } gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); gca.value = value; gca.origin = zhp->zfs_name; if (gca.numclones != 0) { zfs_handle_t *root; char pool[ZFS_MAX_DATASET_NAME_LEN]; char *cp = pool; /* get the pool name */ (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); (void) strsep(&cp, "/@"); root = zfs_open(zhp->zfs_hdl, pool, ZFS_TYPE_FILESYSTEM); if (root == NULL) { nvlist_free(nv); nvlist_free(value); return (NULL); } (void) get_clones_cb(root, &gca); } if (gca.numclones != 0 || nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || nvlist_add_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { nvlist_free(nv); nvlist_free(value); return (NULL); } nvlist_free(nv); nvlist_free(value); verify(0 == nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), &nv)); } verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0); return (value); } static int get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; uint64_t *snaps; uint_t nsnaps; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) return (-1); if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, &nsnaps) != 0) return (-1); if (nsnaps == 0) { /* There's no redaction snapshots; pass a special value back */ (void) snprintf(propbuf, proplen, "none"); return (0); } propbuf[0] = '\0'; for (int i = 0; i < nsnaps; i++) { char buf[128]; if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)snaps[i]); (void) strlcat(propbuf, buf, proplen); } return (0); } /* * Accepts a property and value and checks that the value * matches the one found by the channel program. If they are * not equal, print both of them. */ static void zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval, const char *strval) { if (!zhp->zfs_hdl->libzfs_prop_debug) return; int error; char *poolname = zhp->zpool_hdl->zpool_name; const char *prop_name = zfs_prop_to_name(prop); const char *program = "args = ...\n" "ds = args['dataset']\n" "prop = args['property']\n" "value, setpoint = zfs.get_prop(ds, prop)\n" "return {value=value, setpoint=setpoint}\n"; nvlist_t *outnvl; nvlist_t *retnvl; nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string(argnvl, "dataset", zhp->zfs_name); fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop)); error = lzc_channel_program_nosync(poolname, program, 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl); if (error == 0) { retnvl = fnvlist_lookup_nvlist(outnvl, "return"); if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) { int64_t ans; error = nvlist_lookup_int64(retnvl, "value", &ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (ans != intval) { (void) fprintf(stderr, "%s: zfs found %llu, " "but zcp found %llu\n", prop_name, (u_longlong_t)intval, (u_longlong_t)ans); } } else { char *str_ans; error = nvlist_lookup_string(retnvl, "value", &str_ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (strcmp(strval, str_ans) != 0) { (void) fprintf(stderr, "%s: zfs found '%s', but zcp found '%s'\n", prop_name, strval, str_ans); } } } else { (void) fprintf(stderr, "%s: zcp check failed, channel program " "error: %u\n", prop_name, error); } nvlist_free(argnvl); nvlist_free(outnvl); } /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a * human-readable form. * * Returns 0 on success, or -1 on error. */ int zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) { char *source = NULL; uint64_t val; const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (-1); if (received && zfs_prop_readonly(prop)) return (-1); if (src) *src = ZPROP_SRC_NONE; switch (prop) { case ZFS_PROP_CREATION: /* * 'creation' is a time_t stored in the statistics. We convert * this into a string unless 'literal' is specified. */ { val = getprop_uint64(zhp, prop, &source); time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_MOUNTPOINT: /* * Getting the precise mountpoint can be tricky. * * - for 'none' or 'legacy', return those values. * - for inherited mountpoints, we want to take everything * after our ancestor and append it to the inherited value. * * If the pool has an alternate root, we want to prepend that * root to any values we return. */ str = getprop_string(zhp, prop, &source); if (str[0] == '/') { char buf[MAXPATHLEN]; char *root = buf; const char *relpath; /* * If we inherit the mountpoint, even from a dataset * with a received value, the source will be the path of * the dataset we inherit from. If source is * ZPROP_SOURCE_VAL_RECVD, the received value is not * inherited. */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { relpath = ""; } else { relpath = zhp->zfs_name + strlen(source); if (relpath[0] == '/') relpath++; } if ((zpool_get_prop(zhp->zpool_hdl, ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, B_FALSE)) || (strcmp(root, "-") == 0)) root[0] = '\0'; /* * Special case an alternate root of '/'. This will * avoid having multiple leading slashes in the * mountpoint path. */ if (strcmp(root, "/") == 0) root++; /* * If the mountpoint is '/' then skip over this * if we are obtaining either an alternate root or * an inherited mountpoint. */ if (str[1] == '\0' && (root[0] != '\0' || relpath[0] != '\0')) str++; if (relpath[0] == '\0') (void) snprintf(propbuf, proplen, "%s%s", root, str); else (void) snprintf(propbuf, proplen, "%s%s%s%s", root, str, relpath[0] == '@' ? "" : "/", relpath); } else { /* 'legacy' or 'none' */ (void) strlcpy(propbuf, str, proplen); } zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_ORIGIN: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case ZFS_PROP_REDACT_SNAPS: if (get_rsnaps_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_CLONES: if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If quota or reservation is 0, we translate this into 'none' * (unless literal is set), and indicate that it's the default * value. Otherwise, we print the number nicely and indicate * that its set locally. */ if (val == 0) { if (literal) (void) strlcpy(propbuf, "0", proplen); else (void) strlcpy(propbuf, "none", proplen); } else { if (literal) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); else zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If limit is UINT64_MAX, we translate this into 'none' (unless * literal is set), and indicate that it's the default value. * Otherwise, we print the number nicely and indicate that it's * set locally. */ if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else if (val == UINT64_MAX) { (void) strlcpy(propbuf, "none", proplen); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) (void) snprintf(propbuf, proplen, "%llu.%02llu", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); else (void) snprintf(propbuf, proplen, "%llu.%02llux", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_TYPE: switch (zhp->zfs_type) { case ZFS_TYPE_FILESYSTEM: str = "filesystem"; break; case ZFS_TYPE_VOLUME: str = "volume"; break; case ZFS_TYPE_SNAPSHOT: str = "snapshot"; break; case ZFS_TYPE_BOOKMARK: str = "bookmark"; break; default: abort(); } (void) snprintf(propbuf, proplen, "%s", str); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MOUNTED: /* * The 'mounted' property is a pseudo-property that described * whether the filesystem is currently mounted. Even though * it's a boolean value, the typical values of "on" and "off" * don't make sense, so we translate to "yes" and "no". */ if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, src, &source, &val) != 0) return (-1); if (val) (void) strlcpy(propbuf, "yes", proplen); else (void) strlcpy(propbuf, "no", proplen); break; case ZFS_PROP_NAME: /* * The 'name' property is a pseudo-property derived from the * dataset name. It is presented as a real property to simplify * consumers. */ (void) strlcpy(propbuf, zhp->zfs_name, proplen); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL m_label_t *new_sl = NULL; char *ascii = NULL; /* human readable label */ (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); if (literal || (strcasecmp(propbuf, ZFS_MLSLABEL_DEFAULT) == 0)) break; /* * Try to translate the internal hex string to * human-readable output. If there are any * problems just use the hex string. */ if (str_to_label(propbuf, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1) { m_label_free(new_sl); break; } if (label_to_str(new_sl, &ascii, M_LABEL, DEF_NAMES) != 0) { if (ascii) free(ascii); m_label_free(new_sl); break; } m_label_free(new_sl); (void) strlcpy(propbuf, ascii, proplen); free(ascii); #else (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); #endif /* HAVE_MLSLABEL */ } break; case ZFS_PROP_GUID: case ZFS_PROP_CREATETXG: case ZFS_PROP_OBJSETID: /* * These properties are stored as numbers, but they are * identifiers. * We don't want them to be pretty printed, because pretty * printing mangles the ID into a truncated and useless value. */ if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFERENCED: case ZFS_PROP_AVAILABLE: case ZFS_PROP_USED: case ZFS_PROP_USEDSNAP: case ZFS_PROP_USEDDS: case ZFS_PROP_USEDREFRESERV: case ZFS_PROP_USEDCHILD: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) { return (-1); } if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case PROP_TYPE_STRING: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case PROP_TYPE_INDEX: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (zfs_prop_index_to_string(prop, val, &strval) != 0) return (-1); (void) strlcpy(propbuf, strval, proplen); zcp_check(zhp, prop, 0, strval); break; default: abort(); } } get_source(zhp, src, source, statbuf, statlen); return (0); } /* * Utility function to get the given numeric property. Does no validation that * the given property is the appropriate type; should only be used with * hard-coded property types. */ uint64_t zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) { char *source; uint64_t val = 0; (void) get_numeric_property(zhp, prop, NULL, &source, &val); return (val); } static int zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) { char buf[64]; (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); } /* * Similar to zfs_prop_get(), but returns the value as an integer. */ int zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, zprop_source_t *src, char *statbuf, size_t statlen) { char *source; /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) { return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, dgettext(TEXT_DOMAIN, "cannot get property '%s'"), zfs_prop_to_name(prop))); } if (src) *src = ZPROP_SRC_NONE; if (get_numeric_property(zhp, prop, src, &source, value) != 0) return (-1); get_source(zhp, src, source, statbuf, statlen); return (0); } #ifdef HAVE_IDMAP static int idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, char **domainp, idmap_rid_t *ridp) { idmap_get_handle_t *get_hdl = NULL; idmap_stat status; int err = EINVAL; if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) goto out; if (isuser) { err = idmap_get_sidbyuid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } else { err = idmap_get_sidbygid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } if (err == IDMAP_SUCCESS && idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && status == IDMAP_SUCCESS) err = 0; else err = EINVAL; out: if (get_hdl) idmap_get_destroy(get_hdl); return (err); } #endif /* HAVE_IDMAP */ /* * convert the propname into parameters needed by kernel * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 * Eg: groupquota@staff -> ZFS_PROP_GROUPQUOTA, "", 1234 * Eg: groupused@staff -> ZFS_PROP_GROUPUSED, "", 1234 * Eg: projectquota@123 -> ZFS_PROP_PROJECTQUOTA, "", 123 * Eg: projectused@789 -> ZFS_PROP_PROJECTUSED, "", 789 */ static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) { zfs_userquota_prop_t type; char *cp; boolean_t isuser; boolean_t isgroup; boolean_t isproject; struct passwd *pw; struct group *gr; domain[0] = '\0'; /* Figure out the property type ({user|group|project}{quota|space}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(propname, zfs_userquota_prop_prefixes[type], strlen(zfs_userquota_prop_prefixes[type])) == 0) break; } if (type == ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); *typep = type; isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_USEROBJUSED); isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_GROUPOBJUSED); isproject = (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED); cp = strchr(propname, '@') + 1; if (isuser && (pw = getpwnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = pw->pw_uid; } else if (isgroup && (gr = getgrnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = gr->gr_gid; } else if (!isproject && strchr(cp, '@')) { #ifdef HAVE_IDMAP /* * It's a SID name (eg "user@domain") that needs to be * turned into S-1-domainID-RID. */ directory_error_t e; char *numericsid = NULL; char *end; if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); if (isuser) { e = directory_sid_from_user_name(NULL, cp, &numericsid); } else { e = directory_sid_from_group_name(NULL, cp, &numericsid); } if (e != NULL) { directory_error_free(e); return (ENOENT); } if (numericsid == NULL) return (ENOENT); cp = numericsid; (void) strlcpy(domain, cp, domainlen); cp = strrchr(domain, '-'); *cp = '\0'; cp++; errno = 0; *ridp = strtoull(cp, &end, 10); free(numericsid); if (errno != 0 || *end != '\0') return (EINVAL); #else return (ENOSYS); #endif /* HAVE_IDMAP */ } else { /* It's a user/group/project ID (eg "12345"). */ uid_t id; char *end; id = strtoul(cp, &end, 10); if (*end != '\0') return (EINVAL); if (id > MAXUID && !isproject) { #ifdef HAVE_IDMAP /* It's an ephemeral ID. */ idmap_rid_t rid; char *mapdomain; if (idmap_id_to_numeric_domain_rid(id, isuser, &mapdomain, &rid) != 0) return (ENOENT); (void) strlcpy(domain, mapdomain, domainlen); *ridp = rid; #else return (ENOSYS); #endif /* HAVE_IDMAP */ } else { *ridp = id; } } return (0); } static int zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue, zfs_userquota_prop_t *typep) { int err; zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); err = userquota_propname_decode(propname, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); zc.zc_objset_type = *typep; if (err) return (err); err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { zfs_userquota_prop_t type; return (zfs_prop_get_userquota_common(zhp, propname, propvalue, &type)); } int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; zfs_userquota_prop_t type; err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, &type); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else if (propvalue == 0 && (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTOBJQUOTA)) { (void) strlcpy(propbuf, "none", proplen); } else if (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(propvalue, propbuf, proplen); } else { zfs_nicenum(propvalue, propbuf, proplen); } return (0); } /* * propname must start with "written@" or "written#". */ int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { int err; zfs_cmd_t zc = {"\0"}; const char *snapname; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); assert(zfs_prop_written(propname)); snapname = propname + strlen("written@"); if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { /* full snapshot or bookmark name specified */ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); } else { /* snapname is the short name, append it to zhp's fsname */ char *cp; (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); cp = strchr(zc.zc_value, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); } err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; err = zfs_prop_get_written_int(zhp, propname, &propvalue); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else { zfs_nicebytes(propvalue, propbuf, proplen); } return (0); } /* * Returns the name of the given zfs handle. */ const char * zfs_get_name(const zfs_handle_t *zhp) { return (zhp->zfs_name); } /* * Returns the name of the parent pool for the given zfs handle. */ const char * zfs_get_pool_name(const zfs_handle_t *zhp) { return (zhp->zpool_hdl->zpool_name); } /* * Returns the type of the given zfs handle. */ zfs_type_t zfs_get_type(const zfs_handle_t *zhp) { return (zhp->zfs_type); } /* * Is one dataset name a child dataset of another? * * Needs to handle these cases: * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" * Descendant? No. No. No. Yes. */ static boolean_t is_descendant(const char *ds1, const char *ds2) { size_t d1len = strlen(ds1); /* ds2 can't be a descendant if it's smaller */ if (strlen(ds2) < d1len) return (B_FALSE); /* otherwise, compare strings and verify that there's a '/' char */ return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); } /* * Given a complete name, return just the portion that refers to the parent. * Will return -1 if there is no parent (path is just the name of the * pool). */ static int parent_name(const char *path, char *buf, size_t buflen) { char *slashp; (void) strlcpy(buf, path, buflen); if ((slashp = strrchr(buf, '/')) == NULL) return (-1); *slashp = '\0'; return (0); } int zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen) { return (parent_name(zfs_get_name(zhp), buf, buflen)); } /* * If accept_ancestor is false, then check to make sure that the given path has * a parent, and that it exists. If accept_ancestor is true, then find the * closest existing ancestor for the given path. In prefixlen return the * length of already existing prefix of the given path. We also fetch the * 'zoned' property, which is used to validate property settings when creating * new datasets. */ static int check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, boolean_t accept_ancestor, int *prefixlen) { zfs_cmd_t zc = {"\0"}; char parent[ZFS_MAX_DATASET_NAME_LEN]; char *slash; zfs_handle_t *zhp; char errbuf[1024]; uint64_t is_zoned; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* get parent, and check to see if this is just a pool */ if (parent_name(path, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* check to see if the pool exists */ if ((slash = strchr(parent, '/')) == NULL) slash = parent + strlen(parent); (void) strncpy(zc.zc_name, parent, slash - parent); zc.zc_name[slash - parent] = '\0'; if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 && errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* check to see if the parent dataset exists */ while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { if (errno == ENOENT && accept_ancestor) { /* * Go deeper to find an ancestor, give up on top level. */ if (parent_name(parent, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } } else if (errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent does not exist")); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } else return (zfs_standard_error(hdl, errno, errbuf)); } is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned != NULL) *zoned = is_zoned; /* we are in a non-global zone, but parent is in the global zone */ if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { (void) zfs_standard_error(hdl, EPERM, errbuf); zfs_close(zhp); return (-1); } /* make sure parent is a filesystem */ if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent is not a filesystem")); (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (-1); } zfs_close(zhp); if (prefixlen != NULL) *prefixlen = strlen(parent); return (0); } /* * Finds whether the dataset of the given type(s) exists. */ boolean_t zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) { zfs_handle_t *zhp; if (!zfs_validate_name(hdl, path, types, B_FALSE)) return (B_FALSE); /* * Try to get stats for the dataset, which will tell us if it exists. */ if ((zhp = make_dataset_handle(hdl, path)) != NULL) { int ds_type = zhp->zfs_type; zfs_close(zhp); if (types & ds_type) return (B_TRUE); } return (B_FALSE); } /* * Given a path to 'target', create all the ancestors between * the prefixlen portion of the path, and the target itself. * Fail if the initial prefixlen-ancestor does not already exist. */ int create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) { zfs_handle_t *h; char *cp; const char *opname; /* make sure prefix exists */ cp = target + prefixlen; if (*cp != '/') { assert(strchr(cp, '/') == NULL); h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); } else { *cp = '\0'; h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); *cp = '/'; } if (h == NULL) return (-1); zfs_close(h); /* * Attempt to create, mount, and share any ancestor filesystems, * up to the prefixlen-long one. */ for (cp = target + prefixlen + 1; (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { *cp = '\0'; h = make_dataset_handle(hdl, target); if (h) { /* it already exists, nothing to do here */ zfs_close(h); continue; } if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "create"); goto ancestorerr; } h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); if (h == NULL) { opname = dgettext(TEXT_DOMAIN, "open"); goto ancestorerr; } if (zfs_mount(h, NULL, 0) != 0) { opname = dgettext(TEXT_DOMAIN, "mount"); goto ancestorerr; } if (zfs_share(h) != 0) { opname = dgettext(TEXT_DOMAIN, "share"); goto ancestorerr; } zfs_close(h); } zfs_commit_all_shares(); return (0); ancestorerr: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to %s ancestor '%s'"), opname, target); return (-1); } /* * Creates non-existing ancestors of the given path. */ int zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) { int prefix; char *path_copy; char errbuf[1024]; int rc = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* * Check that we are not passing the nesting limit * before we start creating any ancestors. */ if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); if ((path_copy = strdup(path)) != NULL) { rc = create_parents(hdl, path_copy, prefix); free(path_copy); } if (path_copy == NULL || rc != 0) return (-1); return (0); } /* * Create a new filesystem or volume. */ int zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, nvlist_t *props) { int ret; uint64_t size = 0; uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); uint64_t zoned; enum lzc_dataset_type ost; zpool_handle_t *zpool_handle; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[1024]; char parent[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* validate the path, taking care to note the extended error message */ if (!zfs_validate_name(hdl, path, type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* validate parents exist */ if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) return (-1); /* * The failure modes when creating a dataset of a different type over * one that already exists is a little strange. In particular, if you * try to create a dataset on top of an existing dataset, the ioctl() * will return ENOENT, not EEXIST. To prevent this from happening, we * first try to see if the dataset exists. */ if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (type == ZFS_TYPE_VOLUME) ost = LZC_DATSET_TYPE_ZVOL; else ost = LZC_DATSET_TYPE_ZFS; /* open zpool handle for prop validation */ char pool_path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(pool_path, path, sizeof (pool_path)); /* truncate pool_path at first slash */ char *p = strchr(pool_path, '/'); if (p != NULL) *p = '\0'; if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL) return (-1); if (props && (props = zfs_valid_proplist(hdl, type, props, zoned, NULL, zpool_handle, B_TRUE, errbuf)) == 0) { zpool_close(zpool_handle); return (-1); } zpool_close(zpool_handle); if (type == ZFS_TYPE_VOLUME) { /* * If we are creating a volume, the size and block size must * satisfy a few restraints. First, the blocksize must be a * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the * volsize must be a multiple of the block size, and cannot be * zero. */ if (props == NULL || nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if ((ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blocksize)) != 0) { if (ret == ENOENT) { blocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); } else { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume block size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } if (size == 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size cannot be zero")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if (size % blocksize != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size must be a multiple of volume block " "size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } (void) parent_name(path, parent, sizeof (parent)); if (zfs_crypto_create(hdl, parent, props, NULL, B_TRUE, &wkeydata, &wkeylen) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } /* create the dataset */ ret = lzc_create(path, ost, props, wkeydata, wkeylen); nvlist_free(props); if (wkeydata != NULL) free(wkeydata); /* check for failure */ if (ret != 0) { switch (errno) { case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(hdl, EZFS_NOENT, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to set this " "property or value")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption root's key is not loaded " "or provided")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ERANGE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property value(s) specified")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); #ifdef _ILP32 case EOVERFLOW: /* * This platform can't address a volume this big. */ if (type == ZFS_TYPE_VOLUME) return (zfs_error(hdl, EZFS_VOLTOOBIG, errbuf)); #endif /* FALLTHROUGH */ default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (0); } /* * Destroys the given dataset. The caller must make sure that the filesystem * isn't mounted, and that there are no active dependents. If the file system * does not exist this function does nothing. */ int zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { int error; if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer) return (EINVAL); if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_bookmarks(nv, NULL); fnvlist_free(nv); if (error != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } return (0); } if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { error = lzc_destroy(zhp->zfs_name); } if (error != 0 && error != ENOENT) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } remove_mountpoint(zhp); return (0); } struct destroydata { nvlist_t *nvl; const char *snapname; }; static int zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, dd->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) verify(nvlist_add_boolean(dd->nvl, name) == 0); rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); zfs_close(zhp); return (rv); } /* * Destroys all snapshots with the given name in zhp & descendants. */ int zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); if (nvlist_empty(dd.nvl)) { ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), zhp->zfs_name, snapname); } else { ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); } nvlist_free(dd.nvl); return (ret); } /* * Destroys all the snapshots named in the nvlist. */ int zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) { int ret; nvlist_t *errlist = NULL; nvpair_t *pair; ret = lzc_destroy_snaps(snaps, defer, &errlist); if (ret == 0) { nvlist_free(errlist); return (0); } if (nvlist_empty(errlist)) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); ret = zfs_standard_error(hdl, ret, errbuf); } for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), nvpair_name(pair)); switch (fnvpair_value_int32(pair)) { case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot is cloned")); ret = zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: ret = zfs_standard_error(hdl, errno, errbuf); break; } } nvlist_free(errlist); return (ret); } /* * Clones the given dataset. The target must be of the same type as the source. */ int zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) { char parent[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; uint64_t zoned; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), target); /* validate the target/clone name */ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents exist */ if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) return (-1); (void) parent_name(target, parent, sizeof (parent)); /* do the clone */ if (props) { zfs_type_t type; if (ZFS_IS_VOLUME(zhp)) { type = ZFS_TYPE_VOLUME; } else { type = ZFS_TYPE_FILESYSTEM; } if ((props = zfs_valid_proplist(hdl, type, props, zoned, zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL) return (-1); if (zfs_fix_auto_resv(zhp, props) == -1) { nvlist_free(props); return (-1); } } if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } ret = lzc_clone(target, zhp->zfs_name, props); nvlist_free(props); if (ret != 0) { switch (errno) { case ENOENT: /* * The parent doesn't exist. We should have caught this * above, but there may a race condition that has since * destroyed the parent. * * At this point, we don't know whether it's the source * that doesn't exist anymore, or whether the target * dataset doesn't exist. */ zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); case EXDEV: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "source and target pools differ")); return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, errbuf)); default: return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (ret); } /* * Promotes the given clone fs to be the clone parent. */ int zfs_promote(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; char snapname[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot promote '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots can not be promoted")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (zhp->zfs_dmustats.dds_origin[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname)); if (ret != 0) { switch (ret) { case EACCES: /* * Promoting encrypted dataset outside its * encryption root. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot promote dataset outside its " "encryption root")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); case EEXIST: /* There is a conflicting snapshot name. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "conflicting snapshot '%s' from parent '%s'"), snapname, zhp->zfs_dmustats.dds_origin); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); default: return (zfs_standard_error(hdl, ret, errbuf)); } } return (ret); } typedef struct snapdata { nvlist_t *sd_nvl; const char *sd_snapname; } snapdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snapdata_t *sd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { if (snprintf(name, sizeof (name), "%s@%s", zfs_get_name(zhp), sd->sd_snapname) >= sizeof (name)) return (EINVAL); fnvlist_add_boolean(sd->sd_nvl, name); rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); } zfs_close(zhp); return (rv); } /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. */ int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) { int ret; char errbuf[1024]; nvpair_t *elem; nvlist_t *errors; zpool_handle_t *zpool_hdl; char pool[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshots ")); elem = NULL; while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { const char *snapname = nvpair_name(elem); /* validate the target name */ if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, B_TRUE)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), snapname); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } /* * get pool handle for prop validation. assumes all snaps are in the * same pool, as does lzc_snapshot (below). */ elem = nvlist_next_nvpair(snaps, NULL); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; zpool_hdl = zpool_open(hdl, pool); if (zpool_hdl == NULL) return (-1); if (props != NULL && (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf)) == NULL) { zpool_close(zpool_hdl); return (-1); } zpool_close(zpool_hdl); ret = lzc_snapshot(snaps, props, &errors); if (ret != 0) { boolean_t printed = B_FALSE; for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), nvpair_name(elem)); (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); printed = B_TRUE; } if (!printed) { switch (ret) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple snapshots of same " "fs not allowed")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } } nvlist_free(props); nvlist_free(errors); return (ret); } int zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, nvlist_t *props) { int ret; snapdata_t sd = { 0 }; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; zfs_handle_t *zhp; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot snapshot %s"), path); if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); (void) strlcpy(fsname, path, sizeof (fsname)); cp = strchr(fsname, '@'); *cp = '\0'; sd.sd_snapname = cp + 1; if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { return (-1); } verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); if (recursive) { (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); } else { fnvlist_add_boolean(sd.sd_nvl, path); } ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); zfs_close(zhp); return (ret); } /* * Destroy any more recent snapshots. We invoke this callback on any dependents * of the snapshot first. If the 'cb_dependent' member is non-zero, then this * is a dependent and we should just destroy it without checking the transaction * group. */ typedef struct rollback_data { const char *cb_target; /* the snapshot */ uint64_t cb_create; /* creation time reference */ boolean_t cb_error; boolean_t cb_force; } rollback_data_t; static int rollback_destroy_dependent(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; prop_changelist_t *clp; /* We must destroy this clone; first unmount it */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, cbp->cb_force ? MS_FORCE: 0); if (clp == NULL || changelist_prefix(clp) != 0) { cbp->cb_error = B_TRUE; zfs_close(zhp); return (0); } if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); (void) changelist_postfix(clp); changelist_free(clp); zfs_close(zhp); return (0); } static int rollback_destroy(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, rollback_destroy_dependent, cbp); cbp->cb_error |= zfs_destroy(zhp, B_FALSE); } zfs_close(zhp); return (0); } /* * Given a dataset, rollback to a specific snapshot, discarding any * data changes since then and making it the active dataset. * * Any snapshots and bookmarks more recent than the target are * destroyed, along with their dependents (i.e. clones). */ int zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) { rollback_data_t cb = { 0 }; int err; boolean_t restore_resv = 0; uint64_t old_volsize = 0, new_volsize; zfs_prop_t resv_prop = { 0 }; uint64_t min_txg = 0; assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || zhp->zfs_type == ZFS_TYPE_VOLUME); /* * Destroy all recent snapshots and their dependents. */ cb.cb_force = force; cb.cb_target = snap->zfs_name; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); if (cb.cb_create > 0) min_txg = cb.cb_create; (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb, min_txg, 0); (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb); if (cb.cb_error) return (-1); /* * Now that we have verified that the snapshot is the latest, * rollback to the given snapshot. */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); restore_resv = (old_volsize == zfs_prop_get_int(zhp, resv_prop)); } /* * Pass both the filesystem and the wanted snapshot names, * we would get an error back if the snapshot is destroyed or * a new snapshot is created before this request is processed. */ err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name); if (err != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), zhp->zfs_name); switch (err) { case EEXIST: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "there is a snapshot or bookmark more recent " "than '%s'"), snap->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf); break; case ESRCH: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is not found among snapshots of '%s'"), snap->zfs_name, zhp->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf); break; case EINVAL: (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf); } return (err); } /* * For volumes, if the pre-rollback volsize matched the pre- * rollback reservation and the volsize has changed then set * the reservation property to the post-rollback volsize. * Make a new handle since the rollback closed the dataset. */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) err = zfs_prop_set_int(zhp, resv_prop, new_volsize); } zfs_close(zhp); } return (err); } /* * Renames the given dataset. */ int -zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - boolean_t force_unmount) +zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags) { int ret = 0; zfs_cmd_t zc = {"\0"}; char *delim; prop_changelist_t *cl = NULL; char parent[ZFS_MAX_DATASET_NAME_LEN]; + char property[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; /* if we have the same exact name, just return success */ if (strcmp(zhp->zfs_name, target) == 0) return (0); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename to '%s'"), target); /* make sure source name is valid */ if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* * Make sure the target name is valid */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { if ((strchr(target, '@') == NULL) || *target == '@') { /* * Snapshot target name is abbreviated, * reconstruct full dataset name */ (void) strlcpy(parent, zhp->zfs_name, sizeof (parent)); delim = strchr(parent, '@'); if (strchr(target, '@') == NULL) *(++delim) = '\0'; else *delim = '\0'; (void) strlcat(parent, target, sizeof (parent)); target = parent; } else { /* * Make sure we're renaming within the same dataset. */ delim = strchr(target, '@'); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '@') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots must be part of same " "dataset")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { - if (recursive) { + if (flags.recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "recursive rename must be a snapshot")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents */ if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) return (-1); /* make sure we're in the same pool */ verify((delim = strchr(target, '/')) != NULL); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "datasets must be within same pool")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } /* new name cannot be a child of the current dataset name */ if (is_descendant(zhp->zfs_name, target)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "New dataset name cannot be a descendant of " "current dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); if (getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } - if (recursive) { - zfs_handle_t *zhrp; + /* + * Avoid unmounting file systems with mountpoint property set to + * 'legacy' or 'none' even if -u option is not given. + */ + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && + !flags.recursive && !flags.nounmount && + zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, + sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + flags.nounmount = B_TRUE; + } + if (flags.recursive) { char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { ret = -1; goto error; } delim = strchr(parentname, '@'); *delim = '\0'; - zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); + zfs_handle_t *zhrp = zfs_open(zhp->zfs_hdl, parentname, + ZFS_TYPE_DATASET); free(parentname); if (zhrp == NULL) { ret = -1; goto error; } zfs_close(zhrp); } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, + flags.nounmount ? CL_GATHER_DONT_UNMOUNT : CL_GATHER_ITER_MOUNTED, - force_unmount ? MS_FORCE : 0)) == NULL) + flags.forceunmount ? MS_FORCE : 0)) == NULL) return (-1); if (changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); (void) zfs_error(hdl, EZFS_ZONED, errbuf); ret = -1; goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; } if (ZFS_IS_VOLUME(zhp)) zc.zc_objset_type = DMU_OST_ZVOL; else zc.zc_objset_type = DMU_OST_ZFS; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - zc.zc_cookie = recursive; + zc.zc_cookie = !!flags.recursive; + zc.zc_cookie |= (!!flags.nounmount) << 1; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { /* * if it was recursive, the one that actually failed will * be in zc.zc_name */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zc.zc_name); - if (recursive && errno == EEXIST) { + if (flags.recursive && errno == EEXIST) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "a child dataset already has a snapshot " "with the new name")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else if (errno == EACCES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot move encrypted child outside of " "its encryption root")); (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); } else { (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); } /* * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. */ if (cl != NULL) (void) changelist_postfix(cl); } else { if (cl != NULL) { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } } error: if (cl != NULL) { changelist_free(cl); } return (ret); } nvlist_t * zfs_get_all_props(zfs_handle_t *zhp) { return (zhp->zfs_props); } nvlist_t * zfs_get_recvd_props(zfs_handle_t *zhp) { if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (NULL); return (zhp->zfs_recvd_props); } nvlist_t * zfs_get_user_props(zfs_handle_t *zhp) { return (zhp->zfs_user_props); } /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: * * - If this is a list of all properties, then expand the list to include * all native properties, and set a flag so that for each dataset we look * for new unique user properties and add them to the list. * * - For non fixed-width properties, keep track of the maximum width seen * so that we can size the column appropriately. If the user has * requested received property values, we also need to compute the width * of the RECEIVED column. */ int zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, boolean_t literal) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; zprop_list_t **last, **start; nvlist_t *userprops, *propval; nvpair_t *elem; char *strval; char buf[ZFS_MAXPROPLEN]; if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) return (-1); userprops = zfs_get_user_props(zhp); entry = *plp; if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { /* * Go through and add any user properties as necessary. We * start by incrementing our list pointer to the first * non-native property. */ start = plp; while (*start != NULL) { if ((*start)->pl_prop == ZPROP_INVAL) break; start = &(*start)->pl_next; } elem = NULL; while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { /* * See if we've already found this property in our list. */ for (last = start; *last != NULL; last = &(*last)->pl_next) { if (strcmp((*last)->pl_user_prop, nvpair_name(elem)) == 0) break; } if (*last == NULL) { if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL || ((entry->pl_user_prop = zfs_strdup(hdl, nvpair_name(elem)))) == NULL) { free(entry); return (-1); } entry->pl_prop = ZPROP_INVAL; entry->pl_width = strlen(nvpair_name(elem)); entry->pl_all = B_TRUE; *last = entry; } } } /* * Now go through and check the width of any non-fixed columns */ for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, entry->pl_prop, buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (received && zfs_prop_get_recvd(zhp, zfs_prop_to_name(entry->pl_prop), buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } else { if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, &propval) == 0) { verify(nvlist_lookup_string(propval, ZPROP_VALUE, &strval) == 0); if (strlen(strval) > entry->pl_width) entry->pl_width = strlen(strval); } if (received && zfs_prop_get_recvd(zhp, entry->pl_user_prop, buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } } return (0); } void zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) { nvpair_t *curr; nvpair_t *next; /* * Keep a reference to the props-table against which we prune the * properties. */ zhp->zfs_props_table = props; curr = nvlist_next_nvpair(zhp->zfs_props, NULL); while (curr) { zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); next = nvlist_next_nvpair(zhp->zfs_props, curr); /* * User properties will result in ZPROP_INVAL, and since we * only know how to prune standard ZFS properties, we always * leave these in the list. This can also happen if we * encounter an unknown DSL property (when running older * software, for example). */ if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) (void) nvlist_remove(zhp->zfs_props, nvpair_name(curr), nvpair_type(curr)); curr = next; } } static int zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, zfs_smb_acl_op_t cmd, char *resource1, char *resource2) { zfs_cmd_t zc = {"\0"}; nvlist_t *nvlist = NULL; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); zc.zc_cookie = (uint64_t)cmd; if (cmd == ZFS_SMB_ACL_RENAME) { if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (0); } } switch (cmd) { case ZFS_SMB_ACL_ADD: case ZFS_SMB_ACL_REMOVE: (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); break; case ZFS_SMB_ACL_RENAME: if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, resource1) != 0) { (void) no_memory(hdl); return (-1); } if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, resource2) != 0) { (void) no_memory(hdl); return (-1); } if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { nvlist_free(nvlist); return (-1); } break; case ZFS_SMB_ACL_PURGE: break; default: return (-1); } error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); nvlist_free(nvlist); return (error); } int zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, resource, NULL)); } int zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, resource, NULL)); } int zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, NULL, NULL)); } int zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, char *oldname, char *newname) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, oldname, newname)); } int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, zfs_userspace_cb_t func, void *arg) { zfs_cmd_t zc = {"\0"}; zfs_useracct_t buf[100]; libzfs_handle_t *hdl = zhp->zfs_hdl; int ret; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = type; zc.zc_nvlist_dst = (uintptr_t)buf; for (;;) { zfs_useracct_t *zua = buf; zc.zc_nvlist_dst_size = sizeof (buf); if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { char errbuf[1024]; if ((errno == ENOTSUP && (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA))) break; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get used/quota for %s"), zc.zc_name); return (zfs_standard_error_fmt(hdl, errno, errbuf)); } if (zc.zc_nvlist_dst_size == 0) break; while (zc.zc_nvlist_dst_size > 0) { if ((ret = func(arg, zua->zu_domain, zua->zu_rid, zua->zu_space)) != 0) return (ret); zua++; zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); } } return (0); } struct holdarg { nvlist_t *nvl; const char *snapname; const char *tag; boolean_t recursive; int error; }; static int zfs_hold_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_string(ha->nvl, name, ha->tag); if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); zfs_close(zhp); return (rv); } int zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive, int cleanup_fd) { int ret; struct holdarg ha; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { char errbuf[1024]; fnvlist_free(ha.nvl); ret = ENOENT; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s@%s'"), zhp->zfs_name, snapname); (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); return (ret); } ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); fnvlist_free(ha.nvl); return (ret); } int zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) { int ret; nvlist_t *errors; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; nvpair_t *elem; errors = NULL; ret = lzc_hold(holds, cleanup_fd, &errors); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold")); switch (ret) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case E2BIG: /* * Temporary tags wind up having the ds object id * prepended. So even if we passed the length check * above, it's still possible for the tag to wind * up being slightly too long. */ (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case EEXIST: (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } static int zfs_release_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; nvlist_t *existing_holds; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) { ha->error = EINVAL; rv = EINVAL; } if (lzc_get_holds(name, &existing_holds) != 0) { ha->error = ENOENT; } else if (!nvlist_exists(existing_holds, ha->tag)) { ha->error = ESRCH; } else { nvlist_t *torelease = fnvlist_alloc(); fnvlist_add_boolean(torelease, ha->tag); fnvlist_add_nvlist(ha->nvl, name, torelease); fnvlist_free(torelease); } if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); zfs_close(zhp); return (rv); } int zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive) { int ret; struct holdarg ha; nvlist_t *errors = NULL; nvpair_t *elem; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; ha.error = 0; (void) zfs_release_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { fnvlist_free(ha.nvl); ret = ha.error; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s@%s'"), zhp->zfs_name, snapname); if (ret == ESRCH) { (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); } else { (void) zfs_standard_error(hdl, ret, errbuf); } return (ret); } ret = lzc_release(ha.nvl, &errors); fnvlist_free(ha.nvl); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release")); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zfs_standard_error_fmt(hdl, errno, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case ESRCH: (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error_fmt(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } int zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; int nvsz = 2048; void *nvbuf; int err = 0; char errbuf[1024]; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); tryagain: nvbuf = malloc(nvsz); if (nvbuf == NULL) { err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); goto out; } zc.zc_nvlist_dst_size = nvsz; zc.zc_nvlist_dst = (uintptr_t)nvbuf; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); switch (errno) { case ENOMEM: free(nvbuf); nvsz = zc.zc_nvlist_dst_size; goto tryagain; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error_fmt(hdl, errno, errbuf); break; } } else { /* success */ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); if (rc) { (void) snprintf(errbuf, sizeof (errbuf), dgettext( TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); err = zfs_standard_error_fmt(hdl, rc, errbuf); } } free(nvbuf); out: return (err); } int zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; char *nvbuf; char errbuf[1024]; size_t nvsz; int err; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); assert(err == 0); nvbuf = malloc(nvsz); err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); assert(err == 0); zc.zc_nvlist_src_size = nvsz; zc.zc_nvlist_src = (uintptr_t)nvbuf; zc.zc_perm_action = un; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), zc.zc_name); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error_fmt(hdl, errno, errbuf); break; } } free(nvbuf); return (err); } int zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) { int err; char errbuf[1024]; err = lzc_get_holds(zhp->zfs_name, nvl); if (err != 0) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), zhp->zfs_name); switch (err) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error_fmt(hdl, errno, errbuf); break; } } return (err); } /* * The theory of raidz space accounting * * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block * will "reference" 128KB, even though it allocates more than that, to store the * parity information (and perhaps skip sectors). This concept of the * "referenced" (and other DMU space accounting) being lower than the allocated * space by a constant factor is called "raidz deflation." * * As mentioned above, the constant factor for raidz deflation assumes a 128KB * block size. However, zvols typically have a much smaller block size (default * 8KB). These smaller blocks may require proportionally much more parity * information (and perhaps skip sectors). In this case, the change to the * "referenced" property may be much more than the logical block size. * * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written * as follows. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D8 | D16 | D24 | * | P1 | D1 | D9 | D17 | D25 | * | P2 | D2 | D10 | D18 | D26 | * | P3 | D3 | D11 | D19 | D27 | * | P4 | D4 | D12 | D20 | D28 | * | P5 | D5 | D13 | D21 | D29 | * | P6 | D6 | D14 | D22 | D30 | * | P7 | D7 | D15 | D23 | D31 | * +-------+-------+-------+-------+-------+ * * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data * sectors. The dataset's referenced will increase by 128k and the pool's * allocated and free properties will be adjusted by 160k. * * A 4k block written to the same raidz vdev will require two 4k sectors. The * blank cells represent unallocated space. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | | | | * +-------+-------+-------+-------+-------+ * * Above, notice that the 4k block required one sector for parity and another * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated * and free properties will be adjusted by 8k. The dataset will not be charged * 8k. Rather, it will be charged a value that is scaled according to the * overhead of the 128k block on the same vdev. This 8k allocation will be * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as * calculated in the 128k block example above. * * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 * allocations are a multiple of 3 sectors, and raidz3 allocations are a * multiple of of 4 sectors. When a block does not fill the required number of * sectors, skip blocks (sectors) are used. * * An 8k block being written to a raidz vdev may be written as follows: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | | * +-------+-------+-------+-------+-------+ * * In order to maintain the nparity+1 allocation size, a skip block (S0) was * added. For this 8k block, the pool's allocated and free properties are * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in * the 128k block example above. * * Compression may lead to a variety of block sizes being written for the same * volume or file. There is no clear way to reserve just the amount of space * that will be required, so the worst case (no compression) is assumed. * Note that metadata blocks will typically be compressed, so the reservation * size returned by zvol_volsize_to_reservation() will generally be slightly * larger than the maximum that the volume can reference. */ /* * Derived from function of same name in module/zfs/vdev_raidz.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. Note that the "referenced" space accounted will be less than this, but * not necessarily equal to "blksize", due to RAIDZ deflation. */ static uint64_t vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { uint64_t asize, ndata; ASSERT3U(ndisks, >, nparity); ndata = ndisks - nparity; asize = ((blksize - 1) >> ashift) + 1; asize += nparity * ((asize + ndata - 1) / ndata); asize = roundup(asize, nparity + 1) << ashift; return (asize); } /* * Determine how much space will be allocated if it lands on the most space- * inefficient top-level vdev. Returns the size in bytes required to store one * copy of the volume data. See theory comment above. */ static uint64_t volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) { nvlist_t *config, *tree, **vdevs; uint_t nvdevs, v; uint64_t ret = 0; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (nblocks * blksize); } for (v = 0; v < nvdevs; v++) { char *type; uint64_t nparity, ashift, asize, tsize; nvlist_t **disks; uint_t ndisks; uint64_t volsize; if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 || nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, &nparity) != 0 || nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, &ashift) != 0 || nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) { continue; } /* allocation size for the "typical" 128k block */ tsize = vdev_raidz_asize(ndisks, nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); /* * Scale this size down as a ratio of 128k / tsize. See theory * statement above. */ volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; if (volsize > ret) { ret = volsize; } } if (ret == 0) { ret = nblocks * blksize; } return (ret); } /* * Convert the zvol's volume size to an appropriate reservation. See theory * comment above. * * Note: If this routine is updated, it is necessary to update the ZFS test * suite's shell version in reservation.shlib. */ uint64_t zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, nvlist_t *props) { uint64_t numdb; uint64_t nblocks, volblocksize; int ncopies; char *strval; if (nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) ncopies = atoi(strval); else ncopies = 1; if (nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; nblocks = volsize / volblocksize; /* * Metadata defaults to using 128k blocks, not volblocksize blocks. For * this reason, only the data blocks are scaled based on vdev config. */ volsize = volsize_from_vdevs(zph, nblocks, volblocksize); /* start with metadnode L0-L6 */ numdb = 7; /* calculate number of indirects */ while (nblocks > 1) { nblocks += DNODES_PER_LEVEL - 1; nblocks /= DNODES_PER_LEVEL; numdb += nblocks; } numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); volsize *= ncopies; /* * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't * compressed, but in practice they compress down to about * 1100 bytes */ numdb *= 1ULL << DN_MAX_INDBLKSHIFT; volsize += numdb; return (volsize); } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent fses are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zfs's wait cmd). In that * scenario, a fs being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the fs doesn't * exist. */ int zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait_fs(zhp->zfs_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), zhp->zfs_name); } return (error); } Index: vendor-sys/openzfs/dist/lib/libzpool/kernel.c =================================================================== --- vendor-sys/openzfs/dist/lib/libzpool/kernel.c (revision 365339) +++ vendor-sys/openzfs/dist/lib/libzpool/kernel.c (revision 365340) @@ -1,1410 +1,1416 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include /* * Emulation of kernel services in userland. */ uint64_t physmem; char hw_serial[HW_HOSTID_LEN]; struct utsname hw_utsname; /* If set, all blocks read will be copied to the specified directory. */ char *vn_dumpdir = NULL; /* this only exists to have its address taken */ struct proc p0; /* * ========================================================================= * threads * ========================================================================= * * TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for * the expected stack depth while small enough to avoid exhausting address * space with high thread counts. */ #define TS_STACK_MIN MAX(PTHREAD_STACK_MIN, 32768) #define TS_STACK_MAX (256 * 1024) /*ARGSUSED*/ kthread_t * zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) { pthread_attr_t attr; pthread_t tid; char *stkstr; int detachstate = PTHREAD_CREATE_DETACHED; VERIFY0(pthread_attr_init(&attr)); if (state & TS_JOINABLE) detachstate = PTHREAD_CREATE_JOINABLE; VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); /* * We allow the default stack size in user space to be specified by * setting the ZFS_STACK_SIZE environment variable. This allows us * the convenience of observing and debugging stack overruns in * user space. Explicitly specified stack sizes will be honored. * The usage of ZFS_STACK_SIZE is discussed further in the * ENVIRONMENT VARIABLES sections of the ztest(1) man page. */ if (stksize == 0) { stkstr = getenv("ZFS_STACK_SIZE"); if (stkstr == NULL) stksize = TS_STACK_MAX; else stksize = MAX(atoi(stkstr), TS_STACK_MIN); } VERIFY3S(stksize, >, 0); stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE); /* * If this ever fails, it may be because the stack size is not a * multiple of system page size. */ VERIFY0(pthread_attr_setstacksize(&attr, stksize)); VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE)); VERIFY0(pthread_create(&tid, &attr, (void *(*)(void *))func, arg)); VERIFY0(pthread_attr_destroy(&attr)); return ((void *)(uintptr_t)tid); } /* * ========================================================================= * kstats * ========================================================================= */ /*ARGSUSED*/ kstat_t * kstat_create(const char *module, int instance, const char *name, const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) { return (NULL); } /*ARGSUSED*/ void kstat_install(kstat_t *ksp) {} /*ARGSUSED*/ void kstat_delete(kstat_t *ksp) {} /*ARGSUSED*/ void kstat_waitq_enter(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_waitq_exit(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_runq_enter(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_runq_exit(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_waitq_to_runq(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_runq_back_to_waitq(kstat_io_t *kiop) {} void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) {} /* * ========================================================================= * mutexes * ========================================================================= */ void mutex_init(kmutex_t *mp, char *name, int type, void *cookie) { VERIFY0(pthread_mutex_init(&mp->m_lock, NULL)); memset(&mp->m_owner, 0, sizeof (pthread_t)); } void mutex_destroy(kmutex_t *mp) { VERIFY0(pthread_mutex_destroy(&mp->m_lock)); } void mutex_enter(kmutex_t *mp) { VERIFY0(pthread_mutex_lock(&mp->m_lock)); mp->m_owner = pthread_self(); } int mutex_tryenter(kmutex_t *mp) { int error; error = pthread_mutex_trylock(&mp->m_lock); if (error == 0) { mp->m_owner = pthread_self(); return (1); } else { VERIFY3S(error, ==, EBUSY); return (0); } } void mutex_exit(kmutex_t *mp) { memset(&mp->m_owner, 0, sizeof (pthread_t)); VERIFY0(pthread_mutex_unlock(&mp->m_lock)); } /* * ========================================================================= * rwlocks * ========================================================================= */ void rw_init(krwlock_t *rwlp, char *name, int type, void *arg) { VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL)); rwlp->rw_readers = 0; rwlp->rw_owner = 0; } void rw_destroy(krwlock_t *rwlp) { VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock)); } void rw_enter(krwlock_t *rwlp, krw_t rw) { if (rw == RW_READER) { VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock)); atomic_inc_uint(&rwlp->rw_readers); } else { VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock)); rwlp->rw_owner = pthread_self(); } } void rw_exit(krwlock_t *rwlp) { if (RW_READ_HELD(rwlp)) atomic_dec_uint(&rwlp->rw_readers); else rwlp->rw_owner = 0; VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock)); } int rw_tryenter(krwlock_t *rwlp, krw_t rw) { int error; if (rw == RW_READER) error = pthread_rwlock_tryrdlock(&rwlp->rw_lock); else error = pthread_rwlock_trywrlock(&rwlp->rw_lock); if (error == 0) { if (rw == RW_READER) atomic_inc_uint(&rwlp->rw_readers); else rwlp->rw_owner = pthread_self(); return (1); } VERIFY3S(error, ==, EBUSY); return (0); } /* ARGSUSED */ uint32_t zone_get_hostid(void *zonep) { /* * We're emulating the system's hostid in userland. */ return (strtoul(hw_serial, NULL, 10)); } int rw_tryupgrade(krwlock_t *rwlp) { return (0); } /* * ========================================================================= * condition variables * ========================================================================= */ void cv_init(kcondvar_t *cv, char *name, int type, void *arg) { VERIFY0(pthread_cond_init(cv, NULL)); } void cv_destroy(kcondvar_t *cv) { VERIFY0(pthread_cond_destroy(cv)); } void cv_wait(kcondvar_t *cv, kmutex_t *mp) { memset(&mp->m_owner, 0, sizeof (pthread_t)); VERIFY0(pthread_cond_wait(cv, &mp->m_lock)); mp->m_owner = pthread_self(); } int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) { cv_wait(cv, mp); return (1); } int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) { int error; struct timeval tv; struct timespec ts; clock_t delta; delta = abstime - ddi_get_lbolt(); if (delta <= 0) return (-1); VERIFY(gettimeofday(&tv, NULL) == 0); ts.tv_sec = tv.tv_sec + delta / hz; ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } memset(&mp->m_owner, 0, sizeof (pthread_t)); error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); mp->m_owner = pthread_self(); if (error == ETIMEDOUT) return (-1); VERIFY0(error); return (1); } /*ARGSUSED*/ int cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { int error; struct timeval tv; struct timespec ts; hrtime_t delta; ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); delta = tim; if (flag & CALLOUT_FLAG_ABSOLUTE) delta -= gethrtime(); if (delta <= 0) return (-1); VERIFY0(gettimeofday(&tv, NULL)); ts.tv_sec = tv.tv_sec + delta / NANOSEC; ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } memset(&mp->m_owner, 0, sizeof (pthread_t)); error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); mp->m_owner = pthread_self(); if (error == ETIMEDOUT) return (-1); VERIFY0(error); return (1); } void cv_signal(kcondvar_t *cv) { VERIFY0(pthread_cond_signal(cv)); } void cv_broadcast(kcondvar_t *cv) { VERIFY0(pthread_cond_broadcast(cv)); } /* * ========================================================================= * procfs list * ========================================================================= */ void seq_printf(struct seq_file *m, const char *fmt, ...) {} void procfs_list_install(const char *module, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off) { mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); procfs_list->pl_next_id = 1; procfs_list->pl_node_offset = procfs_list_node_off; } void procfs_list_uninstall(procfs_list_t *procfs_list) {} void procfs_list_destroy(procfs_list_t *procfs_list) { ASSERT(list_is_empty(&procfs_list->pl_list)); list_destroy(&procfs_list->pl_list); mutex_destroy(&procfs_list->pl_lock); } #define NODE_ID(procfs_list, obj) \ (((procfs_list_node_t *)(((char *)obj) + \ (procfs_list)->pl_node_offset))->pln_id) void procfs_list_add(procfs_list_t *procfs_list, void *p) { ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; list_insert_tail(&procfs_list->pl_list, p); } /* * ========================================================================= * vnode operations * ========================================================================= */ /* * ========================================================================= * Figure out which debugging statements to print * ========================================================================= */ static char *dprintf_string; static int dprintf_print_all; int dprintf_find_string(const char *string) { char *tmp_str = dprintf_string; int len = strlen(string); /* * Find out if this is a string we want to print. * String format: file1.c,function_name1,file2.c,file3.c */ while (tmp_str != NULL) { if (strncmp(tmp_str, string, len) == 0 && (tmp_str[len] == ',' || tmp_str[len] == '\0')) return (1); tmp_str = strchr(tmp_str, ','); if (tmp_str != NULL) tmp_str++; /* Get rid of , */ } return (0); } void dprintf_setup(int *argc, char **argv) { int i, j; /* * Debugging can be specified two ways: by setting the * environment variable ZFS_DEBUG, or by including a * "debug=..." argument on the command line. The command * line setting overrides the environment variable. */ for (i = 1; i < *argc; i++) { int len = strlen("debug="); /* First look for a command line argument */ if (strncmp("debug=", argv[i], len) == 0) { dprintf_string = argv[i] + len; /* Remove from args */ for (j = i; j < *argc; j++) argv[j] = argv[j+1]; argv[j] = NULL; (*argc)--; } } if (dprintf_string == NULL) { /* Look for ZFS_DEBUG environment variable */ dprintf_string = getenv("ZFS_DEBUG"); } /* * Are we just turning on all debugging? */ if (dprintf_find_string("on")) dprintf_print_all = 1; if (dprintf_string != NULL) zfs_flags |= ZFS_DEBUG_DPRINTF; } /* * ========================================================================= * debug printfs * ========================================================================= */ void __dprintf(boolean_t dprint, const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } if (dprint) { /* dprintf messages are printed immediately */ if (!dprintf_print_all && !dprintf_find_string(newfile) && !dprintf_find_string(func)) return; /* Print out just the function name if requested */ flockfile(stdout); if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) (void) printf("%ju ", (uintmax_t)(uintptr_t)pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) (void) printf("%llu ", gethrtime()); if (dprintf_find_string("long")) (void) printf("%s, line %d: ", newfile, line); (void) printf("dprintf: %s: ", func); va_start(adx, fmt); (void) vprintf(fmt, adx); va_end(adx); funlockfile(stdout); } else { /* zfs_dbgmsg is logged for dumping later */ size_t size; char *buf; int i; size = 1024; buf = umem_alloc(size, UMEM_NOFAIL); i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); if (i < size) { va_start(adx, fmt); (void) vsnprintf(buf + i, size - i, fmt, adx); va_end(adx); } __zfs_dbgmsg(buf); umem_free(buf, size); } } /* * ========================================================================= * cmn_err() and panic() * ========================================================================= */ static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; void vpanic(const char *fmt, va_list adx) { (void) fprintf(stderr, "error: "); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "\n"); abort(); /* think of it as a "user-level crash dump" */ } void panic(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vpanic(fmt, adx); va_end(adx); } void vcmn_err(int ce, const char *fmt, va_list adx) { if (ce == CE_PANIC) vpanic(fmt, adx); if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ (void) fprintf(stderr, "%s", ce_prefix[ce]); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "%s", ce_suffix[ce]); } } /*PRINTFLIKE2*/ void cmn_err(int ce, const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(ce, fmt, adx); va_end(adx); } /* * ========================================================================= * misc routines * ========================================================================= */ void delay(clock_t ticks) { (void) poll(0, 0, ticks * (1000 / hz)); } /* * Find highest one bit set. * Returns bit number + 1 of highest bit that is set, otherwise returns 0. * The __builtin_clzll() function is supported by both GCC and Clang. */ int highbit64(uint64_t i) { if (i == 0) return (0); return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); } /* * Find lowest one bit set. * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. * The __builtin_ffsll() function is supported by both GCC and Clang. */ int lowbit64(uint64_t i) { if (i == 0) return (0); return (__builtin_ffsll(i)); } char *random_path = "/dev/random"; char *urandom_path = "/dev/urandom"; static int random_fd = -1, urandom_fd = -1; void random_init(void) { VERIFY((random_fd = open(random_path, O_RDONLY)) != -1); VERIFY((urandom_fd = open(urandom_path, O_RDONLY)) != -1); } void random_fini(void) { close(random_fd); close(urandom_fd); random_fd = -1; urandom_fd = -1; } static int random_get_bytes_common(uint8_t *ptr, size_t len, int fd) { size_t resid = len; ssize_t bytes; ASSERT(fd != -1); while (resid != 0) { bytes = read(fd, ptr, resid); ASSERT3S(bytes, >=, 0); ptr += bytes; resid -= bytes; } return (0); } int random_get_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, random_fd)); } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, urandom_fd)); } int ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) { char *end; *result = strtoul(hw_serial, &end, base); if (*result == 0) return (errno); return (0); } int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) { char *end; *result = strtoull(str, &end, base); if (*result == 0) return (errno); return (0); } utsname_t * utsname(void) { return (&hw_utsname); } /* * ========================================================================= * kernel emulation setup & teardown * ========================================================================= */ static int umem_out_of_memory(void) { char errmsg[] = "out of memory -- generating core dump\n"; (void) fprintf(stderr, "%s", errmsg); abort(); return (0); } void kernel_init(int mode) { extern uint_t rrw_tsd_key; umem_nofail_callback(umem_out_of_memory); physmem = sysconf(_SC_PHYS_PAGES); dprintf("physmem = %llu pages (%.2f GB)\n", physmem, (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0); random_init(); VERIFY0(uname(&hw_utsname)); system_taskq_init(); icp_init(); zstd_init(); spa_init((spa_mode_t)mode); fletcher_4_init(); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); } void kernel_fini(void) { fletcher_4_fini(); spa_fini(); zstd_fini(); icp_fini(); system_taskq_fini(); random_fini(); } uid_t crgetuid(cred_t *cr) { return (0); } uid_t crgetruid(cred_t *cr) { return (0); } gid_t crgetgid(cred_t *cr) { return (0); } int crgetngroups(cred_t *cr) { return (0); } gid_t * crgetgroups(cred_t *cr) { return (NULL); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (0); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { return (0); } int secpolicy_zfs(const cred_t *cr) { return (0); } int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) { return (0); } ksiddomain_t * ksid_lookupdomain(const char *dom) { ksiddomain_t *kd; kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); kd->kd_name = spa_strdup(dom); return (kd); } void ksiddomain_rele(ksiddomain_t *ksid) { spa_strfree(ksid->kd_name); umem_free(ksid, sizeof (ksiddomain_t)); } char * kmem_vasprintf(const char *fmt, va_list adx) { char *buf = NULL; va_list adx_copy; va_copy(adx_copy, adx); VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); va_end(adx_copy); return (buf); } char * kmem_asprintf(const char *fmt, ...) { char *buf = NULL; va_list adx; va_start(adx, fmt); VERIFY(vasprintf(&buf, fmt, adx) != -1); va_end(adx); return (buf); } /* ARGSUSED */ int zfs_onexit_fd_hold(int fd, minor_t *minorp) { *minorp = 0; return (0); } /* ARGSUSED */ void zfs_onexit_fd_rele(int fd) { } /* ARGSUSED */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle) { return (0); } fstrans_cookie_t spl_fstrans_mark(void) { return ((fstrans_cookie_t)0); } void spl_fstrans_unmark(fstrans_cookie_t cookie) { } int __spl_pf_fstrans_check(void) { return (0); } int kmem_cache_reap_active(void) { return (0); } void *zvol_tag = "zvol_tag"; void zvol_create_minor(const char *name) { } void zvol_create_minors_recursive(const char *name) { } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { } void zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async) { } /* * Open file * * path - fully qualified path to file * flags - file attributes O_READ / O_WRITE / O_EXCL * fpp - pointer to return file pointer * * Returns 0 on success underlying error on failure. */ int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { int fd = -1; int dump_fd = -1; int err; int old_umask = 0; zfs_file_t *fp; struct stat64 st; if (!(flags & O_CREAT) && stat64(path, &st) == -1) return (errno); if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) flags |= O_DIRECT; if (flags & O_CREAT) old_umask = umask(0); fd = open64(path, flags, mode); if (fd == -1) return (errno); if (flags & O_CREAT) (void) umask(old_umask); if (vn_dumpdir != NULL) { char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); char *inpath = basename((char *)(uintptr_t)path); (void) snprintf(dumppath, MAXPATHLEN, "%s/%s", vn_dumpdir, inpath); dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); umem_free(dumppath, MAXPATHLEN); if (dump_fd == -1) { err = errno; close(fd); return (err); } } else { dump_fd = -1; } (void) fcntl(fd, F_SETFD, FD_CLOEXEC); fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); fp->f_fd = fd; fp->f_dump_fd = dump_fd; *fpp = fp; return (0); } void zfs_file_close(zfs_file_t *fp) { close(fp->f_fd); if (fp->f_dump_fd != -1) close(fp->f_dump_fd); umem_free(fp, sizeof (zfs_file_t)); } /* * Stateful write - use os internal file pointer to determine where to * write and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { ssize_t rc; rc = write(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless write - os internal file pointer is not updated. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * off - file offset to write to (only valid for seekable types) * resid - pointer to count of unwritten bytes * * Returns 0 on success errno on failure. */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t pos, ssize_t *resid) { ssize_t rc, split, done; int sectors; /* * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. * This is used by ztest to simulate realistic failure modes. */ sectors = count >> SPA_MINBLOCKSHIFT; split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; rc = pwrite64(fp->f_fd, buf, split, pos); if (rc != -1) { done = rc; rc = pwrite64(fp->f_fd, (char *)buf + split, count - split, pos + split); } #ifdef __linux__ if (rc == -1 && errno == EINVAL) { /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order * to catch the offender. */ abort(); } #endif if (rc < 0) return (errno); done += rc; if (resid) { *resid = count - done; } else if (done != count) { return (EIO); } return (0); } /* * Stateful read - use os internal file pointer to determine where to * read and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to read * resid - pointer to count of unread bytes (if short read) * * Returns 0 on success errno on failure. */ int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { int rc; rc = read(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless read - os internal file pointer is not updated. * * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to write * off - file offset to read from (only valid for seekable types) * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { ssize_t rc; rc = pread64(fp->f_fd, buf, count, off); if (rc < 0) { #ifdef __linux__ /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order to * catch the offender. */ if (errno == EINVAL) abort(); #endif return (errno); } if (fp->f_dump_fd != -1) { int status; status = pwrite64(fp->f_dump_fd, buf, rc, off); ASSERT(status != -1); } if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * lseek - set / get file pointer * * fp - pointer to file (pipe, socket, etc) to read from * offp - value to seek to, returns current value plus passed offset * whence - see man pages for standard lseek whence values * * Returns 0 on success errno on failure (ESPIPE for non seekable types) */ int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { loff_t rc; rc = lseek(fp->f_fd, *offp, whence); if (rc < 0) return (errno); *offp = rc; return (0); } /* * Get file attributes * * filp - file pointer * zfattr - pointer to file attr structure * * Currently only used for fetching size and file mode * * Returns 0 on success or error code of underlying getattr call on failure. */ int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) { struct stat64 st; if (fstat64_blk(fp->f_fd, &st) == -1) return (errno); zfattr->zfa_size = st.st_size; zfattr->zfa_mode = st.st_mode; return (0); } /* * Sync file to disk * * filp - file pointer * flags - O_SYNC and or O_DSYNC * * Returns 0 on success or error code of underlying sync call on failure. */ int zfs_file_fsync(zfs_file_t *fp, int flags) { int rc; rc = fsync(fp->f_fd); if (rc < 0) return (errno); return (0); } /* * fallocate - allocate or free space on disk * * fp - file pointer * mode (non-standard options for hole punching etc) * offset - offset to start allocating or freeing from * len - length to free / allocate * * OPTIONAL */ int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) { #ifdef __linux__ return (fallocate(fp->f_fd, mode, offset, len)); #else return (EOPNOTSUPP); #endif } /* * Request current file pointer offset * * fp - pointer to file * * Returns current file offset. */ loff_t zfs_file_off(zfs_file_t *fp) { return (lseek(fp->f_fd, SEEK_CUR, 0)); } /* * unlink file * * path - fully qualified file path * * Returns 0 on success. * * OPTIONAL */ int zfs_file_unlink(const char *path) { return (remove(path)); } /* * Get reference to file pointer * * fd - input file descriptor * fpp - pointer to file pointer * * Returns 0 on success EBADF on failure. * Unsupported in user space. */ int zfs_file_get(int fd, zfs_file_t **fpp) { abort(); return (EOPNOTSUPP); } /* * Drop reference to file pointer * * fd - input file descriptor * * Unsupported in user space. */ void zfs_file_put(int fd) { abort(); +} + +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ } Index: vendor-sys/openzfs/dist/man/man8/zfs-rename.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/zfs-rename.8 (revision 365339) +++ vendor-sys/openzfs/dist/man/man8/zfs-rename.8 (revision 365340) @@ -1,91 +1,119 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" .Dd June 30, 2019 .Dt ZFS-RENAME 8 .Os .Sh NAME .Nm zfs Ns Pf - Cm rename .Nd Renames the given dataset (filesystem or snapshot). .Sh SYNOPSIS .Nm .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm rename -.Op Fl fp +.Fl p +.Op Fl f .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume +.Nm +.Cm rename +.Fl u +.Op Fl f +.Ar filesystem +.Ar filesystem .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm .Cm rename -.Op Fl fp +.Fl p +.Op Fl f .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume .Xc +.It Xo +.Nm +.Cm rename +.Fl u +.Op Fl f +.Ar filesystem +.Ar filesystem +.Xc Renames the given dataset. The new target can be located anywhere in the ZFS hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point. .Bl -tag -width "-a" .It Fl f -Force unmount any filesystems that need to be unmounted in the process. +Force unmount any file systems that need to be unmounted in the process. +This flag has no effect if used together with the +.Fl u +flag. .It Fl p Creates all the nonexistent parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. +.It Fl u +Do not remount file systems during rename. +If a file system's +.Sy mountpoint +property is set to +.Sy legacy +or +.Sy none , +the file system is not unmounted even if this option is not given. .El .It Xo .Nm .Cm rename .Fl r .Ar snapshot Ar snapshot .Xc Recursively rename the snapshots of all descendent datasets. Snapshots are the only dataset that can be renamed recursively. .El Index: vendor-sys/openzfs/dist/module/avl/avl.c =================================================================== --- vendor-sys/openzfs/dist/module/avl/avl.c (revision 365339) +++ vendor-sys/openzfs/dist/module/avl/avl.c (revision 365340) @@ -1,1093 +1,1096 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Delphix. All rights reserved. */ /* * AVL - generic AVL tree implementation for kernel use * * A complete description of AVL trees can be found in many CS textbooks. * * Here is a very brief overview. An AVL tree is a binary search tree that is * almost perfectly balanced. By "almost" perfectly balanced, we mean that at * any given node, the left and right subtrees are allowed to differ in height * by at most 1 level. * * This relaxation from a perfectly balanced binary tree allows doing * insertion and deletion relatively efficiently. Searching the tree is * still a fast operation, roughly O(log(N)). * * The key to insertion and deletion is a set of tree manipulations called * rotations, which bring unbalanced subtrees back into the semi-balanced state. * * This implementation of AVL trees has the following peculiarities: * * - The AVL specific data structures are physically embedded as fields * in the "using" data structures. To maintain generality the code * must constantly translate between "avl_node_t *" and containing * data structure "void *"s by adding/subtracting the avl_offset. * * - Since the AVL data is always embedded in other structures, there is * no locking or memory allocation in the AVL routines. This must be * provided for by the enclosing data structure's semantics. Typically, * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of * exclusive write lock. Other operations require a read lock. * * - The implementation uses iteration instead of explicit recursion, * since it is intended to run on limited size kernel stacks. Since * there is no recursion stack present to move "up" in the tree, * there is an explicit "parent" link in the avl_node_t. * * - The left/right children pointers of a node are in an array. * In the code, variables (instead of constants) are used to represent * left and right indices. The implementation is written as if it only * dealt with left handed manipulations. By changing the value assigned * to "left", the code also works for right handed trees. The * following variables/terms are frequently used: * * int left; // 0 when dealing with left children, * // 1 for dealing with right children * * int left_heavy; // -1 when left subtree is taller at some node, * // +1 when right subtree is taller * * int right; // will be the opposite of left (0 or 1) * int right_heavy;// will be the opposite of left_heavy (-1 or 1) * * int direction; // 0 for "<" (ie. left child); 1 for ">" (right) * * Though it is a little more confusing to read the code, the approach * allows using half as much code (and hence cache footprint) for tree * manipulations and eliminates many conditional branches. * * - The avl_index_t is an opaque "cookie" used to find nodes at or * adjacent to where a new value would be inserted in the tree. The value * is a modified "avl_node_t *". The bottom bit (normally 0 for a * pointer) is set to indicate if that the new node has a value greater * than the value of the indicated "avl_node_t *". * * Note - in addition to userland (e.g. libavl and libutil) and the kernel * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, * which each have their own compilation environments and subsequent * requirements. Each of these environments must be considered when adding * dependencies from avl.c. + * + * Link to Illumos.org for more information on avl function: + * [1] https://illumos.org/man/9f/avl */ #include #include #include #include #include #include /* * Small arrays to translate between balance (or diff) values and child indices. * * Code that deals with binary tree data structures will randomly use * left and right children when examining a tree. C "if()" statements * which evaluate randomly suffer from very poor hardware branch prediction. * In this code we avoid some of the branch mispredictions by using the * following translation arrays. They replace random branches with an * additional memory reference. Since the translation arrays are both very * small the data should remain efficiently in cache. */ static const int avl_child2balance[2] = {-1, 1}; static const int avl_balance2child[] = {0, 0, 1}; /* * Walk from one node to the previous valued node (ie. an infix walk * towards the left). At any given node we do one of 2 things: * * - If there is a left child, go to it, then to it's rightmost descendant. * * - otherwise we return through parent nodes until we've come from a right * child. * * Return Value: * NULL - if at the end of the nodes * otherwise next node */ void * avl_walk(avl_tree_t *tree, void *oldnode, int left) { size_t off = tree->avl_offset; avl_node_t *node = AVL_DATA2NODE(oldnode, off); int right = 1 - left; int was_child; /* * nowhere to walk to if tree is empty */ if (node == NULL) return (NULL); /* * Visit the previous valued node. There are two possibilities: * * If this node has a left child, go down one left, then all * the way right. */ if (node->avl_child[left] != NULL) { for (node = node->avl_child[left]; node->avl_child[right] != NULL; node = node->avl_child[right]) ; /* * Otherwise, return through left children as far as we can. */ } else { for (;;) { was_child = AVL_XCHILD(node); node = AVL_XPARENT(node); if (node == NULL) return (NULL); if (was_child == right) break; } } return (AVL_NODE2DATA(node, off)); } /* * Return the lowest valued node in a tree or NULL. * (leftmost child from root of tree) */ void * avl_first(avl_tree_t *tree) { avl_node_t *node; avl_node_t *prev = NULL; size_t off = tree->avl_offset; for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) prev = node; if (prev != NULL) return (AVL_NODE2DATA(prev, off)); return (NULL); } /* * Return the highest valued node in a tree or NULL. * (rightmost child from root of tree) */ void * avl_last(avl_tree_t *tree) { avl_node_t *node; avl_node_t *prev = NULL; size_t off = tree->avl_offset; for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) prev = node; if (prev != NULL) return (AVL_NODE2DATA(prev, off)); return (NULL); } /* * Access the node immediately before or after an insertion point. * * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child * * Return value: * NULL: no node in the given direction * "void *" of the found tree node */ void * avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) { int child = AVL_INDEX2CHILD(where); avl_node_t *node = AVL_INDEX2NODE(where); void *data; size_t off = tree->avl_offset; if (node == NULL) { ASSERT(tree->avl_root == NULL); return (NULL); } data = AVL_NODE2DATA(node, off); if (child != direction) return (data); return (avl_walk(tree, data, direction)); } /* * Search for the node which contains "value". The algorithm is a * simple binary tree search. * * return value: * NULL: the value is not in the AVL tree * *where (if not NULL) is set to indicate the insertion point * "void *" of the found tree node */ void * avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) { avl_node_t *node; avl_node_t *prev = NULL; int child = 0; int diff; size_t off = tree->avl_offset; for (node = tree->avl_root; node != NULL; node = node->avl_child[child]) { prev = node; diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); ASSERT(-1 <= diff && diff <= 1); if (diff == 0) { #ifdef ZFS_DEBUG if (where != NULL) *where = 0; #endif return (AVL_NODE2DATA(node, off)); } child = avl_balance2child[1 + diff]; } if (where != NULL) *where = AVL_MKINDEX(prev, child); return (NULL); } /* * Perform a rotation to restore balance at the subtree given by depth. * * This routine is used by both insertion and deletion. The return value * indicates: * 0 : subtree did not change height * !0 : subtree was reduced in height * * The code is written as if handling left rotations, right rotations are * symmetric and handled by swapping values of variables right/left[_heavy] * * On input balance is the "new" balance at "node". This value is either * -2 or +2. */ static int avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) { int left = !(balance < 0); /* when balance = -2, left will be 0 */ int right = 1 - left; int left_heavy = balance >> 1; int right_heavy = -left_heavy; avl_node_t *parent = AVL_XPARENT(node); avl_node_t *child = node->avl_child[left]; avl_node_t *cright; avl_node_t *gchild; avl_node_t *gright; avl_node_t *gleft; int which_child = AVL_XCHILD(node); int child_bal = AVL_XBALANCE(child); /* BEGIN CSTYLED */ /* * case 1 : node is overly left heavy, the left child is balanced or * also left heavy. This requires the following rotation. * * (node bal:-2) * / \ * / \ * (child bal:0 or -1) * / \ * / \ * cright * * becomes: * * (child bal:1 or 0) * / \ * / \ * (node bal:-1 or 0) * / \ * / \ * cright * * we detect this situation by noting that child's balance is not * right_heavy. */ /* END CSTYLED */ if (child_bal != right_heavy) { /* * compute new balance of nodes * * If child used to be left heavy (now balanced) we reduced * the height of this sub-tree -- used in "return...;" below */ child_bal += right_heavy; /* adjust towards right */ /* * move "cright" to be node's left child */ cright = child->avl_child[right]; node->avl_child[left] = cright; if (cright != NULL) { AVL_SETPARENT(cright, node); AVL_SETCHILD(cright, left); } /* * move node to be child's right child */ child->avl_child[right] = node; AVL_SETBALANCE(node, -child_bal); AVL_SETCHILD(node, right); AVL_SETPARENT(node, child); /* * update the pointer into this subtree */ AVL_SETBALANCE(child, child_bal); AVL_SETCHILD(child, which_child); AVL_SETPARENT(child, parent); if (parent != NULL) parent->avl_child[which_child] = child; else tree->avl_root = child; return (child_bal == 0); } /* BEGIN CSTYLED */ /* * case 2 : When node is left heavy, but child is right heavy we use * a different rotation. * * (node b:-2) * / \ * / \ * / \ * (child b:+1) * / \ * / \ * (gchild b: != 0) * / \ * / \ * gleft gright * * becomes: * * (gchild b:0) * / \ * / \ * / \ * (child b:?) (node b:?) * / \ / \ * / \ / \ * gleft gright * * computing the new balances is more complicated. As an example: * if gchild was right_heavy, then child is now left heavy * else it is balanced */ /* END CSTYLED */ gchild = child->avl_child[right]; gleft = gchild->avl_child[left]; gright = gchild->avl_child[right]; /* * move gright to left child of node and * * move gleft to right child of node */ node->avl_child[left] = gright; if (gright != NULL) { AVL_SETPARENT(gright, node); AVL_SETCHILD(gright, left); } child->avl_child[right] = gleft; if (gleft != NULL) { AVL_SETPARENT(gleft, child); AVL_SETCHILD(gleft, right); } /* * move child to left child of gchild and * * move node to right child of gchild and * * fixup parent of all this to point to gchild */ balance = AVL_XBALANCE(gchild); gchild->avl_child[left] = child; AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); AVL_SETPARENT(child, gchild); AVL_SETCHILD(child, left); gchild->avl_child[right] = node; AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); AVL_SETPARENT(node, gchild); AVL_SETCHILD(node, right); AVL_SETBALANCE(gchild, 0); AVL_SETPARENT(gchild, parent); AVL_SETCHILD(gchild, which_child); if (parent != NULL) parent->avl_child[which_child] = gchild; else tree->avl_root = gchild; return (1); /* the new tree is always shorter */ } /* * Insert a new node into an AVL tree at the specified (from avl_find()) place. * * Newly inserted nodes are always leaf nodes in the tree, since avl_find() * searches out to the leaf positions. The avl_index_t indicates the node * which will be the parent of the new node. * * After the node is inserted, a single rotation further up the tree may * be necessary to maintain an acceptable AVL balance. */ void avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) { avl_node_t *node; avl_node_t *parent = AVL_INDEX2NODE(where); int old_balance; int new_balance; int which_child = AVL_INDEX2CHILD(where); size_t off = tree->avl_offset; ASSERT(tree); #ifdef _LP64 ASSERT(((uintptr_t)new_data & 0x7) == 0); #endif node = AVL_DATA2NODE(new_data, off); /* * First, add the node to the tree at the indicated position. */ ++tree->avl_numnodes; node->avl_child[0] = NULL; node->avl_child[1] = NULL; AVL_SETCHILD(node, which_child); AVL_SETBALANCE(node, 0); AVL_SETPARENT(node, parent); if (parent != NULL) { ASSERT(parent->avl_child[which_child] == NULL); parent->avl_child[which_child] = node; } else { ASSERT(tree->avl_root == NULL); tree->avl_root = node; } /* * Now, back up the tree modifying the balance of all nodes above the * insertion point. If we get to a highly unbalanced ancestor, we * need to do a rotation. If we back out of the tree we are done. * If we brought any subtree into perfect balance (0), we are also done. */ for (;;) { node = parent; if (node == NULL) return; /* * Compute the new balance */ old_balance = AVL_XBALANCE(node); new_balance = old_balance + avl_child2balance[which_child]; /* * If we introduced equal balance, then we are done immediately */ if (new_balance == 0) { AVL_SETBALANCE(node, 0); return; } /* * If both old and new are not zero we went * from -1 to -2 balance, do a rotation. */ if (old_balance != 0) break; AVL_SETBALANCE(node, new_balance); parent = AVL_XPARENT(node); which_child = AVL_XCHILD(node); } /* * perform a rotation to fix the tree and return */ (void) avl_rotation(tree, node, new_balance); } /* * Insert "new_data" in "tree" in the given "direction" either after or * before (AVL_AFTER, AVL_BEFORE) the data "here". * * Insertions can only be done at empty leaf points in the tree, therefore * if the given child of the node is already present we move to either * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since * every other node in the tree is a leaf, this always works. * * To help developers using this interface, we assert that the new node * is correctly ordered at every step of the way in DEBUG kernels. */ void avl_insert_here( avl_tree_t *tree, void *new_data, void *here, int direction) { avl_node_t *node; int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ #ifdef ZFS_DEBUG int diff; #endif ASSERT(tree != NULL); ASSERT(new_data != NULL); ASSERT(here != NULL); ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); /* * If corresponding child of node is not NULL, go to the neighboring * node and reverse the insertion direction. */ node = AVL_DATA2NODE(here, tree->avl_offset); #ifdef ZFS_DEBUG diff = tree->avl_compar(new_data, here); ASSERT(-1 <= diff && diff <= 1); ASSERT(diff != 0); ASSERT(diff > 0 ? child == 1 : child == 0); #endif if (node->avl_child[child] != NULL) { node = node->avl_child[child]; child = 1 - child; while (node->avl_child[child] != NULL) { #ifdef ZFS_DEBUG diff = tree->avl_compar(new_data, AVL_NODE2DATA(node, tree->avl_offset)); ASSERT(-1 <= diff && diff <= 1); ASSERT(diff != 0); ASSERT(diff > 0 ? child == 1 : child == 0); #endif node = node->avl_child[child]; } #ifdef ZFS_DEBUG diff = tree->avl_compar(new_data, AVL_NODE2DATA(node, tree->avl_offset)); ASSERT(-1 <= diff && diff <= 1); ASSERT(diff != 0); ASSERT(diff > 0 ? child == 1 : child == 0); #endif } ASSERT(node->avl_child[child] == NULL); avl_insert(tree, new_data, AVL_MKINDEX(node, child)); } /* * Add a new node to an AVL tree. Strictly enforce that no duplicates can * be added to the tree with a VERIFY which is enabled for non-DEBUG builds. */ void avl_add(avl_tree_t *tree, void *new_node) { avl_index_t where = 0; VERIFY(avl_find(tree, new_node, &where) == NULL); avl_insert(tree, new_node, where); } /* * Delete a node from the AVL tree. Deletion is similar to insertion, but * with 2 complications. * * First, we may be deleting an interior node. Consider the following subtree: * * d c c * / \ / \ / \ * b e b e b e * / \ / \ / * a c a a * * When we are deleting node (d), we find and bring up an adjacent valued leaf * node, say (c), to take the interior node's place. In the code this is * handled by temporarily swapping (d) and (c) in the tree and then using * common code to delete (d) from the leaf position. * * Secondly, an interior deletion from a deep tree may require more than one * rotation to fix the balance. This is handled by moving up the tree through * parents and applying rotations as needed. The return value from * avl_rotation() is used to detect when a subtree did not change overall * height due to a rotation. */ void avl_remove(avl_tree_t *tree, void *data) { avl_node_t *delete; avl_node_t *parent; avl_node_t *node; avl_node_t tmp; int old_balance; int new_balance; int left; int right; int which_child; size_t off = tree->avl_offset; ASSERT(tree); delete = AVL_DATA2NODE(data, off); /* * Deletion is easiest with a node that has at most 1 child. * We swap a node with 2 children with a sequentially valued * neighbor node. That node will have at most 1 child. Note this * has no effect on the ordering of the remaining nodes. * * As an optimization, we choose the greater neighbor if the tree * is right heavy, otherwise the left neighbor. This reduces the * number of rotations needed. */ if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { /* * choose node to swap from whichever side is taller */ old_balance = AVL_XBALANCE(delete); left = avl_balance2child[old_balance + 1]; right = 1 - left; /* * get to the previous value'd node * (down 1 left, as far as possible right) */ for (node = delete->avl_child[left]; node->avl_child[right] != NULL; node = node->avl_child[right]) ; /* * create a temp placeholder for 'node' * move 'node' to delete's spot in the tree */ tmp = *node; *node = *delete; if (node->avl_child[left] == node) node->avl_child[left] = &tmp; parent = AVL_XPARENT(node); if (parent != NULL) parent->avl_child[AVL_XCHILD(node)] = node; else tree->avl_root = node; AVL_SETPARENT(node->avl_child[left], node); AVL_SETPARENT(node->avl_child[right], node); /* * Put tmp where node used to be (just temporary). * It always has a parent and at most 1 child. */ delete = &tmp; parent = AVL_XPARENT(delete); parent->avl_child[AVL_XCHILD(delete)] = delete; which_child = (delete->avl_child[1] != 0); if (delete->avl_child[which_child] != NULL) AVL_SETPARENT(delete->avl_child[which_child], delete); } /* * Here we know "delete" is at least partially a leaf node. It can * be easily removed from the tree. */ ASSERT(tree->avl_numnodes > 0); --tree->avl_numnodes; parent = AVL_XPARENT(delete); which_child = AVL_XCHILD(delete); if (delete->avl_child[0] != NULL) node = delete->avl_child[0]; else node = delete->avl_child[1]; /* * Connect parent directly to node (leaving out delete). */ if (node != NULL) { AVL_SETPARENT(node, parent); AVL_SETCHILD(node, which_child); } if (parent == NULL) { tree->avl_root = node; return; } parent->avl_child[which_child] = node; /* * Since the subtree is now shorter, begin adjusting parent balances * and performing any needed rotations. */ do { /* * Move up the tree and adjust the balance * * Capture the parent and which_child values for the next * iteration before any rotations occur. */ node = parent; old_balance = AVL_XBALANCE(node); new_balance = old_balance - avl_child2balance[which_child]; parent = AVL_XPARENT(node); which_child = AVL_XCHILD(node); /* * If a node was in perfect balance but isn't anymore then * we can stop, since the height didn't change above this point * due to a deletion. */ if (old_balance == 0) { AVL_SETBALANCE(node, new_balance); break; } /* * If the new balance is zero, we don't need to rotate * else * need a rotation to fix the balance. * If the rotation doesn't change the height * of the sub-tree we have finished adjusting. */ if (new_balance == 0) AVL_SETBALANCE(node, new_balance); else if (!avl_rotation(tree, node, new_balance)) break; } while (parent != NULL); } #define AVL_REINSERT(tree, obj) \ avl_remove((tree), (obj)); \ avl_add((tree), (obj)) boolean_t avl_update_lt(avl_tree_t *t, void *obj) { void *neighbor; ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || (t->avl_compar(obj, neighbor) <= 0)); neighbor = AVL_PREV(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } return (B_FALSE); } boolean_t avl_update_gt(avl_tree_t *t, void *obj) { void *neighbor; ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || (t->avl_compar(obj, neighbor) >= 0)); neighbor = AVL_NEXT(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } return (B_FALSE); } boolean_t avl_update(avl_tree_t *t, void *obj) { void *neighbor; neighbor = AVL_PREV(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } neighbor = AVL_NEXT(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } return (B_FALSE); } void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) { avl_node_t *temp_node; ulong_t temp_numnodes; ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); ASSERT3U(tree1->avl_size, ==, tree2->avl_size); temp_node = tree1->avl_root; temp_numnodes = tree1->avl_numnodes; tree1->avl_root = tree2->avl_root; tree1->avl_numnodes = tree2->avl_numnodes; tree2->avl_root = temp_node; tree2->avl_numnodes = temp_numnodes; } /* * initialize a new AVL tree */ void avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), size_t size, size_t offset) { ASSERT(tree); ASSERT(compar); ASSERT(size > 0); ASSERT(size >= offset + sizeof (avl_node_t)); #ifdef _LP64 ASSERT((offset & 0x7) == 0); #endif tree->avl_compar = compar; tree->avl_root = NULL; tree->avl_numnodes = 0; tree->avl_size = size; tree->avl_offset = offset; } /* * Delete a tree. */ /* ARGSUSED */ void avl_destroy(avl_tree_t *tree) { ASSERT(tree); ASSERT(tree->avl_numnodes == 0); ASSERT(tree->avl_root == NULL); } /* * Return the number of nodes in an AVL tree. */ ulong_t avl_numnodes(avl_tree_t *tree) { ASSERT(tree); return (tree->avl_numnodes); } boolean_t avl_is_empty(avl_tree_t *tree) { ASSERT(tree); return (tree->avl_numnodes == 0); } #define CHILDBIT (1L) /* * Post-order tree walk used to visit all tree nodes and destroy the tree * in post order. This is used for removing all the nodes from a tree without * paying any cost for rebalancing it. * * example: * * void *cookie = NULL; * my_data_t *node; * * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) * free(node); * avl_destroy(tree); * * The cookie is really an avl_node_t to the current node's parent and * an indication of which child you looked at last. * * On input, a cookie value of CHILDBIT indicates the tree is done. */ void * avl_destroy_nodes(avl_tree_t *tree, void **cookie) { avl_node_t *node; avl_node_t *parent; int child; void *first; size_t off = tree->avl_offset; /* * Initial calls go to the first node or it's right descendant. */ if (*cookie == NULL) { first = avl_first(tree); /* * deal with an empty tree */ if (first == NULL) { *cookie = (void *)CHILDBIT; return (NULL); } node = AVL_DATA2NODE(first, off); parent = AVL_XPARENT(node); goto check_right_side; } /* * If there is no parent to return to we are done. */ parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); if (parent == NULL) { if (tree->avl_root != NULL) { ASSERT(tree->avl_numnodes == 1); tree->avl_root = NULL; tree->avl_numnodes = 0; } return (NULL); } /* * Remove the child pointer we just visited from the parent and tree. */ child = (uintptr_t)(*cookie) & CHILDBIT; parent->avl_child[child] = NULL; ASSERT(tree->avl_numnodes > 1); --tree->avl_numnodes; /* * If we just did a right child or there isn't one, go up to parent. */ if (child == 1 || parent->avl_child[1] == NULL) { node = parent; parent = AVL_XPARENT(parent); goto done; } /* * Do parent's right child, then leftmost descendent. */ node = parent->avl_child[1]; while (node->avl_child[0] != NULL) { parent = node; node = node->avl_child[0]; } /* * If here, we moved to a left child. It may have one * child on the right (when balance == +1). */ check_right_side: if (node->avl_child[1] != NULL) { ASSERT(AVL_XBALANCE(node) == 1); parent = node; node = node->avl_child[1]; ASSERT(node->avl_child[0] == NULL && node->avl_child[1] == NULL); } else { ASSERT(AVL_XBALANCE(node) <= 0); } done: if (parent == NULL) { *cookie = (void *)CHILDBIT; ASSERT(node == tree->avl_root); } else { *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); } return (AVL_NODE2DATA(node, off)); } #if defined(_KERNEL) static int __init avl_init(void) { return (0); } static void __exit avl_fini(void) { } module_init(avl_init); module_exit(avl_fini); #endif ZFS_MODULE_DESCRIPTION("Generic AVL tree implementation"); ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); ZFS_MODULE_LICENSE(ZFS_META_LICENSE); ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(avl_create); EXPORT_SYMBOL(avl_find); EXPORT_SYMBOL(avl_insert); EXPORT_SYMBOL(avl_insert_here); EXPORT_SYMBOL(avl_walk); EXPORT_SYMBOL(avl_first); EXPORT_SYMBOL(avl_last); EXPORT_SYMBOL(avl_nearest); EXPORT_SYMBOL(avl_add); EXPORT_SYMBOL(avl_swap); EXPORT_SYMBOL(avl_is_empty); EXPORT_SYMBOL(avl_remove); EXPORT_SYMBOL(avl_numnodes); EXPORT_SYMBOL(avl_destroy_nodes); EXPORT_SYMBOL(avl_destroy); EXPORT_SYMBOL(avl_update_lt); EXPORT_SYMBOL(avl_update_gt); EXPORT_SYMBOL(avl_update); Index: vendor-sys/openzfs/dist/module/nvpair/nvpair.c =================================================================== --- vendor-sys/openzfs/dist/module/nvpair/nvpair.c (revision 365339) +++ vendor-sys/openzfs/dist/module/nvpair/nvpair.c (revision 365340) @@ -1,3729 +1,3738 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2017 by Delphix. All rights reserved. * Copyright 2018 RackTop Systems. */ +/* + * Links to Illumos.org for more information on Interface Libraries: + * [1] https://illumos.org/man/3lib/libnvpair + * [2] https://illumos.org/man/3nvpair/nvlist_alloc + * [3] https://illumos.org/man/9f/nvlist_alloc + * [4] https://illumos.org/man/9f/nvlist_next_nvpair + * [5] https://illumos.org/man/9f/nvpair_value_byte + */ + #include #include #include #include #include #include #include #include #include #if defined(_KERNEL) #include #include #else #include #include #include #endif #define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++ /* * nvpair.c - Provides kernel & userland interfaces for manipulating * name-value pairs. * * Overview Diagram * * +--------------+ * | nvlist_t | * |--------------| * | nvl_version | * | nvl_nvflag | * | nvl_priv -+-+ * | nvl_flag | | * | nvl_pad | | * +--------------+ | * V * +--------------+ last i_nvp in list * | nvpriv_t | +---------------------> * |--------------| | * +--+- nvp_list | | +------------+ * | | nvp_last -+--+ + nv_alloc_t | * | | nvp_curr | |------------| * | | nvp_nva -+----> | nva_ops | * | | nvp_stat | | nva_arg | * | +--------------+ +------------+ * | * +-------+ * V * +---------------------+ +-------------------+ * | i_nvp_t | +-->| i_nvp_t | +--> * |---------------------| | |-------------------| | * | nvi_next -+--+ | nvi_next -+--+ * | nvi_prev (NULL) | <----+ nvi_prev | * | . . . . . . . . . . | | . . . . . . . . . | * | nvp (nvpair_t) | | nvp (nvpair_t) | * | - nvp_size | | - nvp_size | * | - nvp_name_sz | | - nvp_name_sz | * | - nvp_value_elem | | - nvp_value_elem | * | - nvp_type | | - nvp_type | * | - data ... | | - data ... | * +---------------------+ +-------------------+ * * * * +---------------------+ +---------------------+ * | i_nvp_t | +--> +-->| i_nvp_t (last) | * |---------------------| | | |---------------------| * | nvi_next -+--+ ... --+ | nvi_next (NULL) | * <-+- nvi_prev |<-- ... <----+ nvi_prev | * | . . . . . . . . . | | . . . . . . . . . | * | nvp (nvpair_t) | | nvp (nvpair_t) | * | - nvp_size | | - nvp_size | * | - nvp_name_sz | | - nvp_name_sz | * | - nvp_value_elem | | - nvp_value_elem | * | - DATA_TYPE_NVLIST | | - nvp_type | * | - data (embedded) | | - data ... | * | nvlist name | +---------------------+ * | +--------------+ | * | | nvlist_t | | * | |--------------| | * | | nvl_version | | * | | nvl_nvflag | | * | | nvl_priv --+---+----> * | | nvl_flag | | * | | nvl_pad | | * | +--------------+ | * +---------------------+ * * * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will * allow value to be aligned on 8 byte boundary * * name_len is the length of the name string including the null terminator * so it must be >= 1 */ #define NVP_SIZE_CALC(name_len, data_len) \ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t nelem, const void *data); #define NV_STAT_EMBEDDED 0x1 #define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp)) #define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp)) #define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz)) #define NVPAIR2I_NVP(nvp) \ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) #ifdef _KERNEL int nvpair_max_recursion = 20; #else int nvpair_max_recursion = 100; #endif uint64_t nvlist_hashtable_init_size = (1 << 4); int nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) { va_list valist; int err = 0; nva->nva_ops = nvo; nva->nva_arg = NULL; va_start(valist, nvo); if (nva->nva_ops->nv_ao_init != NULL) err = nva->nva_ops->nv_ao_init(nva, valist); va_end(valist); return (err); } void nv_alloc_reset(nv_alloc_t *nva) { if (nva->nva_ops->nv_ao_reset != NULL) nva->nva_ops->nv_ao_reset(nva); } void nv_alloc_fini(nv_alloc_t *nva) { if (nva->nva_ops->nv_ao_fini != NULL) nva->nva_ops->nv_ao_fini(nva); } nv_alloc_t * nvlist_lookup_nv_alloc(nvlist_t *nvl) { nvpriv_t *priv; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); return (priv->nvp_nva); } static void * nv_mem_zalloc(nvpriv_t *nvp, size_t size) { nv_alloc_t *nva = nvp->nvp_nva; void *buf; if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL) bzero(buf, size); return (buf); } static void nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) { nv_alloc_t *nva = nvp->nvp_nva; nva->nva_ops->nv_ao_free(nva, buf, size); } static void nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) { bzero(priv, sizeof (nvpriv_t)); priv->nvp_nva = nva; priv->nvp_stat = stat; } static nvpriv_t * nv_priv_alloc(nv_alloc_t *nva) { nvpriv_t *priv; /* * nv_mem_alloc() cannot called here because it needs the priv * argument. */ if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL) return (NULL); nv_priv_init(priv, nva, 0); return (priv); } /* * Embedded lists need their own nvpriv_t's. We create a new * nvpriv_t using the parameters and allocator from the parent * list's nvpriv_t. */ static nvpriv_t * nv_priv_alloc_embedded(nvpriv_t *priv) { nvpriv_t *emb_priv; if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL) return (NULL); nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED); return (emb_priv); } static int nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) { ASSERT3P(priv->nvp_hashtable, ==, NULL); ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *)); if (tab == NULL) return (ENOMEM); priv->nvp_hashtable = tab; priv->nvp_nbuckets = buckets; return (0); } static void nvt_tab_free(nvpriv_t *priv) { i_nvp_t **tab = priv->nvp_hashtable; if (tab == NULL) { ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); return; } nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *)); priv->nvp_hashtable = NULL; priv->nvp_nbuckets = 0; priv->nvp_nentries = 0; } static uint32_t nvt_hash(const char *p) { uint32_t g, hval = 0; while (*p) { hval = (hval << 4) + *p++; if ((g = (hval & 0xf0000000)) != 0) hval ^= g >> 24; hval &= ~g; } return (hval); } static boolean_t nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag) { boolean_t match = B_FALSE; if (nvflag & NV_UNIQUE_NAME_TYPE) { if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 && NVP_TYPE(nvp1) == NVP_TYPE(nvp2)) match = B_TRUE; } else { ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME); if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0) match = B_TRUE; } return (match); } static nvpair_t * nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ASSERT(priv != NULL); i_nvp_t **tab = priv->nvp_hashtable; if (tab == NULL) { ASSERT3P(priv->nvp_list, ==, NULL); ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); return (NULL); } else { ASSERT(priv->nvp_nbuckets != 0); } uint64_t hash = nvt_hash(name); uint64_t index = hash & (priv->nvp_nbuckets - 1); ASSERT3U(index, <, priv->nvp_nbuckets); i_nvp_t *entry = tab[index]; for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) { if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 && (type == DATA_TYPE_DONTCARE || NVP_TYPE(&e->nvi_nvp) == type)) return (&e->nvi_nvp); } return (NULL); } static nvpair_t * nvt_lookup_name(nvlist_t *nvl, const char *name) { return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE)); } static int nvt_resize(nvpriv_t *priv, uint32_t new_size) { i_nvp_t **tab = priv->nvp_hashtable; /* * Migrate all the entries from the current table * to a newly-allocated table with the new size by * re-adjusting the pointers of their entries. */ uint32_t size = priv->nvp_nbuckets; uint32_t new_mask = new_size - 1; ASSERT(ISP2(new_size)); i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *)); if (new_tab == NULL) return (ENOMEM); uint32_t nentries = 0; for (uint32_t i = 0; i < size; i++) { i_nvp_t *next, *e = tab[i]; while (e != NULL) { next = e->nvi_hashtable_next; uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp)); uint32_t index = hash & new_mask; e->nvi_hashtable_next = new_tab[index]; new_tab[index] = e; nentries++; e = next; } tab[i] = NULL; } ASSERT3U(nentries, ==, priv->nvp_nentries); nvt_tab_free(priv); priv->nvp_hashtable = new_tab; priv->nvp_nbuckets = new_size; priv->nvp_nentries = nentries; return (0); } static boolean_t nvt_needs_togrow(nvpriv_t *priv) { /* * Grow only when we have more elements than buckets * and the # of buckets doesn't overflow. */ return (priv->nvp_nentries > priv->nvp_nbuckets && (UINT32_MAX >> 1) >= priv->nvp_nbuckets); } /* * Allocate a new table that's twice the size of the old one, * and migrate all the entries from the old one to the new * one by re-adjusting their pointers. */ static int nvt_grow(nvpriv_t *priv) { uint32_t current_size = priv->nvp_nbuckets; /* ensure we won't overflow */ ASSERT3U(UINT32_MAX >> 1, >=, current_size); return (nvt_resize(priv, current_size << 1)); } static boolean_t nvt_needs_toshrink(nvpriv_t *priv) { /* * Shrink only when the # of elements is less than or * equal to 1/4 the # of buckets. Never shrink less than * nvlist_hashtable_init_size. */ ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size); if (priv->nvp_nbuckets == nvlist_hashtable_init_size) return (B_FALSE); return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2)); } /* * Allocate a new table that's half the size of the old one, * and migrate all the entries from the old one to the new * one by re-adjusting their pointers. */ static int nvt_shrink(nvpriv_t *priv) { uint32_t current_size = priv->nvp_nbuckets; /* ensure we won't overflow */ ASSERT3U(current_size, >=, nvlist_hashtable_init_size); return (nvt_resize(priv, current_size >> 1)); } static int nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; if (nvt_needs_toshrink(priv)) { int err = nvt_shrink(priv); if (err != 0) return (err); } i_nvp_t **tab = priv->nvp_hashtable; char *name = NVP_NAME(nvp); uint64_t hash = nvt_hash(name); uint64_t index = hash & (priv->nvp_nbuckets - 1); ASSERT3U(index, <, priv->nvp_nbuckets); i_nvp_t *bucket = tab[index]; for (i_nvp_t *prev = NULL, *e = bucket; e != NULL; prev = e, e = e->nvi_hashtable_next) { if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_nvflag)) { if (prev != NULL) { prev->nvi_hashtable_next = e->nvi_hashtable_next; } else { ASSERT3P(e, ==, bucket); tab[index] = e->nvi_hashtable_next; } e->nvi_hashtable_next = NULL; priv->nvp_nentries--; break; } } return (0); } static int nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; /* initialize nvpair table now if it doesn't exist. */ if (priv->nvp_hashtable == NULL) { int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size); if (err != 0) return (err); } /* * if we don't allow duplicate entries, make sure to * unlink any existing entries from the table. */ if (nvl->nvl_nvflag != 0) { int err = nvt_remove_nvpair(nvl, nvp); if (err != 0) return (err); } if (nvt_needs_togrow(priv)) { int err = nvt_grow(priv); if (err != 0) return (err); } i_nvp_t **tab = priv->nvp_hashtable; char *name = NVP_NAME(nvp); uint64_t hash = nvt_hash(name); uint64_t index = hash & (priv->nvp_nbuckets - 1); ASSERT3U(index, <, priv->nvp_nbuckets); i_nvp_t *bucket = tab[index]; /* insert link at the beginning of the bucket */ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); new_entry->nvi_hashtable_next = bucket; tab[index] = new_entry; priv->nvp_nentries++; return (0); } static void nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) { nvl->nvl_version = NV_VERSION; nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE); nvl->nvl_priv = (uint64_t)(uintptr_t)priv; nvl->nvl_flag = 0; nvl->nvl_pad = 0; } uint_t nvlist_nvflag(nvlist_t *nvl) { return (nvl->nvl_nvflag); } static nv_alloc_t * nvlist_nv_alloc(int kmflag) { #if defined(_KERNEL) switch (kmflag) { case KM_SLEEP: return (nv_alloc_sleep); case KM_NOSLEEP: return (nv_alloc_nosleep); default: return (nv_alloc_pushpage); } #else return (nv_alloc_nosleep); #endif /* _KERNEL */ } /* * nvlist_alloc - Allocate nvlist. */ int nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) { return (nvlist_xalloc(nvlp, nvflag, nvlist_nv_alloc(kmflag))); } int nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva) { nvpriv_t *priv; if (nvlp == NULL || nva == NULL) return (EINVAL); if ((priv = nv_priv_alloc(nva)) == NULL) return (ENOMEM); if ((*nvlp = nv_mem_zalloc(priv, NV_ALIGN(sizeof (nvlist_t)))) == NULL) { nv_mem_free(priv, priv, sizeof (nvpriv_t)); return (ENOMEM); } nvlist_init(*nvlp, nvflag, priv); return (0); } /* * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair. */ static nvpair_t * nvp_buf_alloc(nvlist_t *nvl, size_t len) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *buf; nvpair_t *nvp; size_t nvsize; /* * Allocate the buffer */ nvsize = len + offsetof(i_nvp_t, nvi_nvp); if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL) return (NULL); nvp = &buf->nvi_nvp; nvp->nvp_size = len; return (nvp); } /* * nvp_buf_free - de-Allocate an i_nvp_t. */ static void nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp); nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize); } /* * nvp_buf_link - link a new nv pair into the nvlist. */ static void nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr = NVPAIR2I_NVP(nvp); /* Put element at end of nvlist */ if (priv->nvp_list == NULL) { priv->nvp_list = priv->nvp_last = curr; } else { curr->nvi_prev = priv->nvp_last; priv->nvp_last->nvi_next = curr; priv->nvp_last = curr; } } /* * nvp_buf_unlink - unlink an removed nvpair out of the nvlist. */ static void nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr = NVPAIR2I_NVP(nvp); /* * protect nvlist_next_nvpair() against walking on freed memory. */ if (priv->nvp_curr == curr) priv->nvp_curr = curr->nvi_next; if (curr == priv->nvp_list) priv->nvp_list = curr->nvi_next; else curr->nvi_prev->nvi_next = curr->nvi_next; if (curr == priv->nvp_last) priv->nvp_last = curr->nvi_prev; else curr->nvi_next->nvi_prev = curr->nvi_prev; } /* * take a nvpair type and number of elements and make sure the are valid */ static int i_validate_type_nelem(data_type_t type, uint_t nelem) { switch (type) { case DATA_TYPE_BOOLEAN: if (nelem != 0) return (EINVAL); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_STRING: case DATA_TYPE_HRTIME: case DATA_TYPE_NVLIST: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif if (nelem != 1) return (EINVAL); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: /* we allow arrays with 0 elements */ break; default: return (EINVAL); } return (0); } /* * Verify nvp_name_sz and check the name string length. */ static int i_validate_nvpair_name(nvpair_t *nvp) { if ((nvp->nvp_name_sz <= 0) || (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0))) return (EFAULT); /* verify the name string, make sure its terminated */ if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0') return (EFAULT); return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT); } static int i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data) { switch (type) { case DATA_TYPE_BOOLEAN_VALUE: if (*(boolean_t *)data != B_TRUE && *(boolean_t *)data != B_FALSE) return (EINVAL); break; case DATA_TYPE_BOOLEAN_ARRAY: { int i; for (i = 0; i < nelem; i++) if (((boolean_t *)data)[i] != B_TRUE && ((boolean_t *)data)[i] != B_FALSE) return (EINVAL); break; } default: break; } return (0); } /* * This function takes a pointer to what should be a nvpair and it's size * and then verifies that all the nvpair fields make sense and can be * trusted. This function is used when decoding packed nvpairs. */ static int i_validate_nvpair(nvpair_t *nvp) { data_type_t type = NVP_TYPE(nvp); int size1, size2; /* verify nvp_name_sz, check the name string length */ if (i_validate_nvpair_name(nvp) != 0) return (EFAULT); if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0) return (EFAULT); /* * verify nvp_type, nvp_value_elem, and also possibly * verify string values and get the value size. */ size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp)); size1 = nvp->nvp_size - NVP_VALOFF(nvp); if (size2 < 0 || size1 != NV_ALIGN(size2)) return (EFAULT); return (0); } static int nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl) { nvpriv_t *priv; i_nvp_t *curr; if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL) return (EINVAL); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { nvpair_t *nvp = &curr->nvi_nvp; int err; if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0) return (err); } return (0); } /* * Frees all memory allocated for an nvpair (like embedded lists) with * the exception of the nvpair buffer itself. */ static void nvpair_free(nvpair_t *nvp) { switch (NVP_TYPE(nvp)) { case DATA_TYPE_NVLIST: nvlist_free(EMBEDDED_NVL(nvp)); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); int i; for (i = 0; i < NVP_NELEM(nvp); i++) if (nvlp[i] != NULL) nvlist_free(nvlp[i]); break; } default: break; } } /* * nvlist_free - free an unpacked nvlist */ void nvlist_free(nvlist_t *nvl) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return; /* * Unpacked nvlist are linked through i_nvp_t */ curr = priv->nvp_list; while (curr != NULL) { nvpair_t *nvp = &curr->nvi_nvp; curr = curr->nvi_next; nvpair_free(nvp); nvp_buf_free(nvl, nvp); } if (!(priv->nvp_stat & NV_STAT_EMBEDDED)) nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t))); else nvl->nvl_priv = 0; nvt_tab_free(priv); nv_mem_free(priv, priv, sizeof (nvpriv_t)); } static int nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; if (nvp == NULL) return (0); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) if (&curr->nvi_nvp == nvp) return (1); return (0); } /* * Make a copy of nvlist */ int nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag) { return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag))); } int nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) { int err; nvlist_t *ret; if (nvl == NULL || nvlp == NULL) return (EINVAL); if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0) return (err); if ((err = nvlist_copy_pairs(nvl, ret)) != 0) nvlist_free(ret); else *nvlp = ret; return (err); } /* * Remove all with matching name */ int nvlist_remove_all(nvlist_t *nvl, const char *name) { int error = ENOENT; if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); nvpair_t *nvp; while ((nvp = nvt_lookup_name(nvl, name)) != NULL) { VERIFY0(nvlist_remove_nvpair(nvl, nvp)); error = 0; } return (error); } /* * Remove first one with matching name and type */ int nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) { if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); if (nvp == NULL) return (ENOENT); return (nvlist_remove_nvpair(nvl, nvp)); } int nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) { if (nvl == NULL || nvp == NULL) return (EINVAL); int err = nvt_remove_nvpair(nvl, nvp); if (err != 0) return (err); nvp_buf_unlink(nvl, nvp); nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (0); } /* * This function calculates the size of an nvpair value. * * The data argument controls the behavior in case of the data types * DATA_TYPE_STRING and * DATA_TYPE_STRING_ARRAY * Is data == NULL then the size of the string(s) is excluded. */ static int i_get_value_size(data_type_t type, const void *data, uint_t nelem) { uint64_t value_sz; if (i_validate_type_nelem(type, nelem) != 0) return (-1); /* Calculate required size for holding value */ switch (type) { case DATA_TYPE_BOOLEAN: value_sz = 0; break; case DATA_TYPE_BOOLEAN_VALUE: value_sz = sizeof (boolean_t); break; case DATA_TYPE_BYTE: value_sz = sizeof (uchar_t); break; case DATA_TYPE_INT8: value_sz = sizeof (int8_t); break; case DATA_TYPE_UINT8: value_sz = sizeof (uint8_t); break; case DATA_TYPE_INT16: value_sz = sizeof (int16_t); break; case DATA_TYPE_UINT16: value_sz = sizeof (uint16_t); break; case DATA_TYPE_INT32: value_sz = sizeof (int32_t); break; case DATA_TYPE_UINT32: value_sz = sizeof (uint32_t); break; case DATA_TYPE_INT64: value_sz = sizeof (int64_t); break; case DATA_TYPE_UINT64: value_sz = sizeof (uint64_t); break; #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: value_sz = sizeof (double); break; #endif case DATA_TYPE_STRING: if (data == NULL) value_sz = 0; else value_sz = strlen(data) + 1; break; case DATA_TYPE_BOOLEAN_ARRAY: value_sz = (uint64_t)nelem * sizeof (boolean_t); break; case DATA_TYPE_BYTE_ARRAY: value_sz = (uint64_t)nelem * sizeof (uchar_t); break; case DATA_TYPE_INT8_ARRAY: value_sz = (uint64_t)nelem * sizeof (int8_t); break; case DATA_TYPE_UINT8_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint8_t); break; case DATA_TYPE_INT16_ARRAY: value_sz = (uint64_t)nelem * sizeof (int16_t); break; case DATA_TYPE_UINT16_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint16_t); break; case DATA_TYPE_INT32_ARRAY: value_sz = (uint64_t)nelem * sizeof (int32_t); break; case DATA_TYPE_UINT32_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint32_t); break; case DATA_TYPE_INT64_ARRAY: value_sz = (uint64_t)nelem * sizeof (int64_t); break; case DATA_TYPE_UINT64_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint64_t); break; case DATA_TYPE_STRING_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint64_t); if (data != NULL) { char *const *strs = data; uint_t i; /* no alignment requirement for strings */ for (i = 0; i < nelem; i++) { if (strs[i] == NULL) return (-1); value_sz += strlen(strs[i]) + 1; } } break; case DATA_TYPE_HRTIME: value_sz = sizeof (hrtime_t); break; case DATA_TYPE_NVLIST: value_sz = NV_ALIGN(sizeof (nvlist_t)); break; case DATA_TYPE_NVLIST_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint64_t) + (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t)); break; default: return (-1); } return (value_sz > INT32_MAX ? -1 : (int)value_sz); } static int nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl) { nvpriv_t *priv; int err; if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t) nvl->nvl_priv)) == NULL) return (ENOMEM); nvlist_init(emb_nvl, onvl->nvl_nvflag, priv); if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) { nvlist_free(emb_nvl); emb_nvl->nvl_priv = 0; } return (err); } /* * nvlist_add_common - Add new pair to nvlist */ static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t nelem, const void *data) { nvpair_t *nvp; uint_t i; int nvp_sz, name_sz, value_sz; int err = 0; if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) return (EINVAL); if (nelem != 0 && data == NULL) return (EINVAL); /* * Verify type and nelem and get the value size. * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) included. */ if ((value_sz = i_get_value_size(type, data, nelem)) < 0) return (EINVAL); if (i_validate_nvpair_value(type, nelem, data) != 0) return (EINVAL); /* * If we're adding an nvlist or nvlist array, ensure that we are not * adding the input nvlist to itself, which would cause recursion, * and ensure that no NULL nvlist pointers are present. */ switch (type) { case DATA_TYPE_NVLIST: if (data == nvl || data == NULL) return (EINVAL); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; for (i = 0; i < nelem; i++) { if (onvlp[i] == nvl || onvlp[i] == NULL) return (EINVAL); } break; } default: break; } /* calculate sizes of the nvpair elements and the nvpair itself */ name_sz = strlen(name) + 1; if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * NBBY - 1)) return (EINVAL); nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) return (ENOMEM); ASSERT(nvp->nvp_size == nvp_sz); nvp->nvp_name_sz = name_sz; nvp->nvp_value_elem = nelem; nvp->nvp_type = type; bcopy(name, NVP_NAME(nvp), name_sz); switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_STRING_ARRAY: { char *const *strs = data; char *buf = NVP_VALUE(nvp); char **cstrs = (void *)buf; /* skip pre-allocated space for pointer array */ buf += nelem * sizeof (uint64_t); for (i = 0; i < nelem; i++) { int slen = strlen(strs[i]) + 1; bcopy(strs[i], buf, slen); cstrs[i] = buf; buf += slen; } break; } case DATA_TYPE_NVLIST: { nvlist_t *nnvl = EMBEDDED_NVL(nvp); nvlist_t *onvl = (nvlist_t *)data; if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { nvp_buf_free(nvl, nvp); return (err); } break; } case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); nvlist_t *embedded = (nvlist_t *) ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); for (i = 0; i < nelem; i++) { if ((err = nvlist_copy_embedded(nvl, onvlp[i], embedded)) != 0) { /* * Free any successfully created lists */ nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvlp[i] = embedded++; } break; } default: bcopy(data, NVP_VALUE(nvp), value_sz); } /* if unique name, remove before add */ if (nvl->nvl_nvflag & NV_UNIQUE_NAME) (void) nvlist_remove_all(nvl, name); else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) (void) nvlist_remove(nvl, name, type); err = nvt_add_nvpair(nvl, nvp); if (err != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvp_buf_link(nvl, nvp); return (0); } int nvlist_add_boolean(nvlist_t *nvl, const char *name) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); } int nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); } int nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); } int nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); } int nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); } int nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); } int nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); } int nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); } int nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); } int nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); } int nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); } #if !defined(_KERNEL) int nvlist_add_double(nvlist_t *nvl, const char *name, double val) { return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); } #endif int nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); } int nvlist_add_boolean_array(nvlist_t *nvl, const char *name, boolean_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_add_string_array(nvlist_t *nvl, const char *name, char *const *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); } int nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); } int nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); } int nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); } /* reading name-value pairs */ nvpair_t * nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); /* * Ensure that nvp is a valid nvpair on this nvlist. * NB: nvp_curr is used only as a hint so that we don't always * have to walk the list to determine if nvp is still on the list. */ if (nvp == NULL) curr = priv->nvp_list; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_next; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? &curr->nvi_nvp : NULL); } nvpair_t * nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); if (nvp == NULL) curr = priv->nvp_last; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_prev; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? &curr->nvi_nvp : NULL); } boolean_t nvlist_empty(nvlist_t *nvl) { nvpriv_t *priv; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (B_TRUE); return (priv->nvp_list == NULL); } char * nvpair_name(nvpair_t *nvp) { return (NVP_NAME(nvp)); } data_type_t nvpair_type(nvpair_t *nvp) { return (NVP_TYPE(nvp)); } int nvpair_type_is_array(nvpair_t *nvp) { data_type_t type = NVP_TYPE(nvp); if ((type == DATA_TYPE_BYTE_ARRAY) || (type == DATA_TYPE_INT8_ARRAY) || (type == DATA_TYPE_UINT8_ARRAY) || (type == DATA_TYPE_INT16_ARRAY) || (type == DATA_TYPE_UINT16_ARRAY) || (type == DATA_TYPE_INT32_ARRAY) || (type == DATA_TYPE_UINT32_ARRAY) || (type == DATA_TYPE_INT64_ARRAY) || (type == DATA_TYPE_UINT64_ARRAY) || (type == DATA_TYPE_BOOLEAN_ARRAY) || (type == DATA_TYPE_STRING_ARRAY) || (type == DATA_TYPE_NVLIST_ARRAY)) return (1); return (0); } static int nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) { int value_sz; if (nvp == NULL || nvpair_type(nvp) != type) return (EINVAL); /* * For non-array types, we copy the data. * For array types (including string), we set a pointer. */ switch (type) { case DATA_TYPE_BOOLEAN: if (nelem != NULL) *nelem = 0; break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif if (data == NULL) return (EINVAL); if ((value_sz = i_get_value_size(type, NULL, 1)) < 0) return (EINVAL); bcopy(NVP_VALUE(nvp), data, (size_t)value_sz); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_NVLIST: case DATA_TYPE_STRING: if (data == NULL) return (EINVAL); *(void **)data = (void *)NVP_VALUE(nvp); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: if (nelem == NULL || data == NULL) return (EINVAL); if ((*nelem = NVP_NELEM(nvp)) != 0) *(void **)data = (void *)NVP_VALUE(nvp); else *(void **)data = NULL; break; default: return (ENOTSUP); } return (0); } static int nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t *nelem, void *data) { if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) return (EINVAL); if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) return (ENOTSUP); nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); if (nvp == NULL) return (ENOENT); return (nvpair_value_common(nvp, type, nelem, data)); } int nvlist_lookup_boolean(nvlist_t *nvl, const char *name) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); } int nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); } int nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); } int nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); } int nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); } int nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); } int nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); } int nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); } int nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); } int nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); } int nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); } #if !defined(_KERNEL) int nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); } #endif int nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); } int nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); } int nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, boolean_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uchar_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, int16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, int32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, int64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_lookup_string_array(nvlist_t *nvl, const char *name, char ***a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); } int nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t ***a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); } int nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val)); } int nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) { va_list ap; char *name; int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0); int ret = 0; va_start(ap, flag); while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { data_type_t type; void *val; uint_t *nelem; switch (type = va_arg(ap, data_type_t)) { case DATA_TYPE_BOOLEAN: ret = nvlist_lookup_common(nvl, name, type, NULL, NULL); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: case DATA_TYPE_STRING: case DATA_TYPE_NVLIST: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif val = va_arg(ap, void *); ret = nvlist_lookup_common(nvl, name, type, NULL, val); break; case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: val = va_arg(ap, void *); nelem = va_arg(ap, uint_t *); ret = nvlist_lookup_common(nvl, name, type, nelem, val); break; default: ret = EINVAL; } if (ret == ENOENT && noentok) ret = 0; } va_end(ap); return (ret); } /* * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function * returns zero and a pointer to the matching nvpair is returned in '*ret' * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate * multiple levels of embedded nvlists, with 'sep' as the separator. As an * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or * "a.d[3].e[1]". This matches the C syntax for array embed (for convenience, * code also supports "a.d[3]e[1]" syntax). * * If 'ip' is non-NULL and the last name component is an array, return the * value of the "...[index]" array index in *ip. For an array reference that * is not indexed, *ip will be returned as -1. If there is a syntax error in * 'name', and 'ep' is non-NULL then *ep will be set to point to the location * inside the 'name' string where the syntax error was detected. */ static int nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, nvpair_t **ret, int *ip, char **ep) { nvpair_t *nvp; const char *np; char *sepp = NULL; char *idxp, *idxep; nvlist_t **nva; long idx = 0; int n; if (ip) *ip = -1; /* not indexed */ if (ep) *ep = NULL; if ((nvl == NULL) || (name == NULL)) return (EINVAL); sepp = NULL; idx = 0; /* step through components of name */ for (np = name; np && *np; np = sepp) { /* ensure unique names */ if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME)) return (ENOTSUP); /* skip white space */ skip_whitespace(np); if (*np == 0) break; /* set 'sepp' to end of current component 'np' */ if (sep) sepp = strchr(np, sep); else sepp = NULL; /* find start of next "[ index ]..." */ idxp = strchr(np, '['); /* if sepp comes first, set idxp to NULL */ if (sepp && idxp && (sepp < idxp)) idxp = NULL; /* * At this point 'idxp' is set if there is an index * expected for the current component. */ if (idxp) { /* set 'n' to length of current 'np' name component */ n = idxp++ - np; /* keep sepp up to date for *ep use as we advance */ skip_whitespace(idxp); sepp = idxp; /* determine the index value */ #if defined(_KERNEL) if (ddi_strtol(idxp, &idxep, 0, &idx)) goto fail; #else idx = strtol(idxp, &idxep, 0); #endif if (idxep == idxp) goto fail; /* keep sepp up to date for *ep use as we advance */ sepp = idxep; /* skip white space index value and check for ']' */ skip_whitespace(sepp); if (*sepp++ != ']') goto fail; /* for embedded arrays, support C syntax: "a[1].b" */ skip_whitespace(sepp); if (sep && (*sepp == sep)) sepp++; } else if (sepp) { n = sepp++ - np; } else { n = strlen(np); } /* trim trailing whitespace by reducing length of 'np' */ if (n == 0) goto fail; for (n--; (np[n] == ' ') || (np[n] == '\t'); n--) ; n++; /* skip whitespace, and set sepp to NULL if complete */ if (sepp) { skip_whitespace(sepp); if (*sepp == 0) sepp = NULL; } /* * At this point: * o 'n' is the length of current 'np' component. * o 'idxp' is set if there was an index, and value 'idx'. * o 'sepp' is set to the beginning of the next component, * and set to NULL if we have no more components. * * Search for nvpair with matching component name. */ for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { /* continue if no match on name */ if (strncmp(np, nvpair_name(nvp), n) || (strlen(nvpair_name(nvp)) != n)) continue; /* if indexed, verify type is array oriented */ if (idxp && !nvpair_type_is_array(nvp)) goto fail; /* * Full match found, return nvp and idx if this * was the last component. */ if (sepp == NULL) { if (ret) *ret = nvp; if (ip && idxp) *ip = (int)idx; /* return index */ return (0); /* found */ } /* * More components: current match must be * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY * to support going deeper. */ if (nvpair_type(nvp) == DATA_TYPE_NVLIST) { nvl = EMBEDDED_NVL(nvp); break; } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) { (void) nvpair_value_nvlist_array(nvp, &nva, (uint_t *)&n); if ((n < 0) || (idx >= n)) goto fail; nvl = nva[idx]; break; } /* type does not support more levels */ goto fail; } if (nvp == NULL) goto fail; /* 'name' not found */ /* search for match of next component in embedded 'nvl' list */ } fail: if (ep && sepp) *ep = sepp; return (EINVAL); } /* * Return pointer to nvpair with specified 'name'. */ int nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret) { return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL)); } /* * Determine if named nvpair exists in nvlist (use embedded separator of '.' * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed * description. */ int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, const char *name, nvpair_t **ret, int *ip, char **ep) { return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep)); } boolean_t nvlist_exists(nvlist_t *nvl, const char *name) { nvpriv_t *priv; nvpair_t *nvp; i_nvp_t *curr; if (name == NULL || nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (B_FALSE); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { nvp = &curr->nvi_nvp; if (strcmp(name, NVP_NAME(nvp)) == 0) return (B_TRUE); } return (B_FALSE); } int nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); } int nvpair_value_byte(nvpair_t *nvp, uchar_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val)); } int nvpair_value_int8(nvpair_t *nvp, int8_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val)); } int nvpair_value_uint8(nvpair_t *nvp, uint8_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val)); } int nvpair_value_int16(nvpair_t *nvp, int16_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val)); } int nvpair_value_uint16(nvpair_t *nvp, uint16_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val)); } int nvpair_value_int32(nvpair_t *nvp, int32_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val)); } int nvpair_value_uint32(nvpair_t *nvp, uint32_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val)); } int nvpair_value_int64(nvpair_t *nvp, int64_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val)); } int nvpair_value_uint64(nvpair_t *nvp, uint64_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); } #if !defined(_KERNEL) int nvpair_value_double(nvpair_t *nvp, double *val) { return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val)); } #endif int nvpair_value_string(nvpair_t *nvp, char **val) { return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val)); } int nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val) { return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val)); } int nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val)); } int nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val)); } int nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val)); } int nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val)); } int nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val)); } int nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val)); } int nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val)); } int nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val)); } int nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val)); } int nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); } int nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); } int nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); } int nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); } /* * Add specified pair to the list. */ int nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) { if (nvl == NULL || nvp == NULL) return (EINVAL); return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))); } /* * Merge the supplied nvlists and put the result in dst. * The merged list will contain all names specified in both lists, * the values are taken from nvl in the case of duplicates. * Return 0 on success. */ /*ARGSUSED*/ int nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) { if (nvl == NULL || dst == NULL) return (EINVAL); if (dst != nvl) return (nvlist_copy_pairs(nvl, dst)); return (0); } /* * Encoding related routines */ #define NVS_OP_ENCODE 0 #define NVS_OP_DECODE 1 #define NVS_OP_GETSIZE 2 typedef struct nvs_ops nvs_ops_t; typedef struct { int nvs_op; const nvs_ops_t *nvs_ops; void *nvs_private; nvpriv_t *nvs_priv; int nvs_recursion; } nvstream_t; /* * nvs operations are: * - nvs_nvlist * encoding / decoding of an nvlist header (nvlist_t) * calculates the size used for header and end detection * * - nvs_nvpair * responsible for the first part of encoding / decoding of an nvpair * calculates the decoded size of an nvpair * * - nvs_nvp_op * second part of encoding / decoding of an nvpair * * - nvs_nvp_size * calculates the encoding size of an nvpair * * - nvs_nvl_fini * encodes the end detection mark (zeros). */ struct nvs_ops { int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvl_fini)(nvstream_t *); }; typedef struct { char nvh_encoding; /* nvs encoding method */ char nvh_endian; /* nvs endian */ char nvh_reserved1; /* reserved for future use */ char nvh_reserved2; /* reserved for future use */ } nvs_header_t; static int nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; /* * Walk nvpair in list and encode each nvpair */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) return (EFAULT); return (nvs->nvs_ops->nvs_nvl_fini(nvs)); } static int nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpair_t *nvp; size_t nvsize; int err; /* * Get decoded size of next pair in stream, alloc * memory for nvpair_t, then decode the nvpair */ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { if (nvsize == 0) /* end of list */ break; /* make sure len makes sense */ if (nvsize < NVP_SIZE_CALC(1, 0)) return (EFAULT); if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) return (ENOMEM); if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { nvp_buf_free(nvl, nvp); return (err); } if (i_validate_nvpair(nvp) != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (EFAULT); } err = nvt_add_nvpair(nvl, nvp); if (err != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvp_buf_link(nvl, nvp); } return (err); } static int nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; uint64_t nvsize = *buflen; size_t size; /* * Get encoded size of nvpairs in nvlist */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) return (EINVAL); if ((nvsize += size) > INT32_MAX) return (EINVAL); } *buflen = nvsize; return (0); } static int nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { int err; if (nvl->nvl_priv == 0) return (EFAULT); /* * Perform the operation, starting with header, then each nvpair */ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) return (err); switch (nvs->nvs_op) { case NVS_OP_ENCODE: err = nvs_encode_pairs(nvs, nvl); break; case NVS_OP_DECODE: err = nvs_decode_pairs(nvs, nvl); break; case NVS_OP_GETSIZE: err = nvs_getsize_pairs(nvs, nvl, buflen); break; default: err = EINVAL; } return (err); } static int nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: { int err; if (nvs->nvs_recursion >= nvpair_max_recursion) return (EINVAL); nvs->nvs_recursion++; err = nvs_operation(nvs, embedded, NULL); nvs->nvs_recursion--; return (err); } case NVS_OP_DECODE: { nvpriv_t *priv; int err; if (embedded->nvl_version != NV_VERSION) return (ENOTSUP); if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) return (ENOMEM); nvlist_init(embedded, embedded->nvl_nvflag, priv); if (nvs->nvs_recursion >= nvpair_max_recursion) { nvlist_free(embedded); return (EINVAL); } nvs->nvs_recursion++; if ((err = nvs_operation(nvs, embedded, NULL)) != 0) nvlist_free(embedded); nvs->nvs_recursion--; return (err); } default: break; } return (EINVAL); } static int nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { size_t nelem = NVP_NELEM(nvp); nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); int i; switch (nvs->nvs_op) { case NVS_OP_ENCODE: for (i = 0; i < nelem; i++) if (nvs_embedded(nvs, nvlp[i]) != 0) return (EFAULT); break; case NVS_OP_DECODE: { size_t len = nelem * sizeof (uint64_t); nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); bzero(nvlp, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if (nvs_embedded(nvs, embedded) != 0) { nvpair_free(nvp); return (EFAULT); } nvlp[i] = embedded++; } break; } case NVS_OP_GETSIZE: { uint64_t nvsize = 0; for (i = 0; i < nelem; i++) { size_t nvp_sz = 0; if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) return (EINVAL); if ((nvsize += nvp_sz) > INT32_MAX) return (EINVAL); } *size = nvsize; break; } default: return (EINVAL); } return (0); } static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); /* * Common routine for nvlist operations: * encode, decode, getsize (encoded size). */ static int nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, int nvs_op) { int err = 0; nvstream_t nvs; int nvl_endian; #if defined(_ZFS_LITTLE_ENDIAN) int host_endian = 1; #elif defined(_ZFS_BIG_ENDIAN) int host_endian = 0; #else #error "No endian defined!" #endif /* _ZFS_LITTLE_ENDIAN */ nvs_header_t *nvh; if (buflen == NULL || nvl == NULL || (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (EINVAL); nvs.nvs_op = nvs_op; nvs.nvs_recursion = 0; /* * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and * a buffer is allocated. The first 4 bytes in the buffer are * used for encoding method and host endian. */ switch (nvs_op) { case NVS_OP_ENCODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); nvh = (void *)buf; nvh->nvh_encoding = encoding; nvh->nvh_endian = nvl_endian = host_endian; nvh->nvh_reserved1 = 0; nvh->nvh_reserved2 = 0; break; case NVS_OP_DECODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); /* get method of encoding from first byte */ nvh = (void *)buf; encoding = nvh->nvh_encoding; nvl_endian = nvh->nvh_endian; break; case NVS_OP_GETSIZE: nvl_endian = host_endian; /* * add the size for encoding */ *buflen = sizeof (nvs_header_t); break; default: return (ENOTSUP); } /* * Create an nvstream with proper encoding method */ switch (encoding) { case NV_ENCODE_NATIVE: /* * check endianness, in case we are unpacking * from a file */ if (nvl_endian != host_endian) return (ENOTSUP); err = nvs_native(&nvs, nvl, buf, buflen); break; case NV_ENCODE_XDR: err = nvs_xdr(&nvs, nvl, buf, buflen); break; default: err = ENOTSUP; break; } return (err); } int nvlist_size(nvlist_t *nvl, size_t *size, int encoding) { return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); } /* * Pack nvlist into contiguous memory */ int nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, int kmflag) { return (nvlist_xpack(nvl, bufp, buflen, encoding, nvlist_nv_alloc(kmflag))); } int nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, nv_alloc_t *nva) { nvpriv_t nvpriv; size_t alloc_size; char *buf; int err; if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) return (EINVAL); if (*bufp != NULL) return (nvlist_common(nvl, *bufp, buflen, encoding, NVS_OP_ENCODE)); /* * Here is a difficult situation: * 1. The nvlist has fixed allocator properties. * All other nvlist routines (like nvlist_add_*, ...) use * these properties. * 2. When using nvlist_pack() the user can specify their own * allocator properties (e.g. by using KM_NOSLEEP). * * We use the user specified properties (2). A clearer solution * will be to remove the kmflag from nvlist_pack(), but we will * not change the interface. */ nv_priv_init(&nvpriv, nva, 0); if ((err = nvlist_size(nvl, &alloc_size, encoding))) return (err); if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) return (ENOMEM); if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, NVS_OP_ENCODE)) != 0) { nv_mem_free(&nvpriv, buf, alloc_size); } else { *buflen = alloc_size; *bufp = buf; } return (err); } /* * Unpack buf into an nvlist_t */ int nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) { return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag))); } int nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) { nvlist_t *nvl; int err; if (nvlp == NULL) return (EINVAL); if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) return (err); if ((err = nvlist_common(nvl, buf, &buflen, NV_ENCODE_NATIVE, NVS_OP_DECODE)) != 0) nvlist_free(nvl); else *nvlp = nvl; return (err); } /* * Native encoding functions */ typedef struct { /* * This structure is used when decoding a packed nvpair in * the native format. n_base points to a buffer containing the * packed nvpair. n_end is a pointer to the end of the buffer. * (n_end actually points to the first byte past the end of the * buffer.) n_curr is a pointer that lies between n_base and n_end. * It points to the current data that we are decoding. * The amount of data left in the buffer is equal to n_end - n_curr. * n_flag is used to recognize a packed embedded list. */ caddr_t n_base; caddr_t n_end; caddr_t n_curr; uint_t n_flag; } nvs_native_t; static int nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, size_t buflen) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: nvs->nvs_private = native; native->n_curr = native->n_base = buf; native->n_end = buf + buflen; native->n_flag = 0; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = native; native->n_curr = native->n_base = native->n_end = NULL; native->n_flag = 0; return (0); default: return (EINVAL); } } /*ARGSUSED*/ static void nvs_native_destroy(nvstream_t *nvs) { } static int native_cp(nvstream_t *nvs, void *buf, size_t size) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; if (native->n_curr + size > native->n_end) return (EFAULT); /* * The bcopy() below eliminates alignment requirement * on the buffer (stream) and is preferred over direct access. */ switch (nvs->nvs_op) { case NVS_OP_ENCODE: bcopy(buf, native->n_curr, size); break; case NVS_OP_DECODE: bcopy(native->n_curr, buf, size); break; default: return (EINVAL); } native->n_curr += size; return (0); } /* * operate on nvlist_t header */ static int nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { nvs_native_t *native = nvs->nvs_private; switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native->n_flag) return (0); /* packed embedded list */ native->n_flag = 1; /* copy version and nvflag of the nvlist_t */ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) return (EFAULT); return (0); case NVS_OP_GETSIZE: /* * if calculate for packed embedded list * 4 for end of the embedded list * else * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag * and 4 for end of the entire list */ if (native->n_flag) { *size += 4; } else { native->n_flag = 1; *size += 2 * sizeof (int32_t) + 4; } return (0); default: return (EINVAL); } } static int nvs_native_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; /* * Add 4 zero bytes at end of nvlist. They are used * for end detection by the decode routine. */ if (native->n_curr + sizeof (int) > native->n_end) return (EFAULT); bzero(native->n_curr, sizeof (int)); native->n_curr += sizeof (int); } return (0); } static int nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; nvlist_t *packed = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out the pointer that is meaningless in the packed * structure. The address may not be aligned, so we have * to use bzero. */ bzero((char *)packed + offsetof(nvlist_t, nvl_priv), sizeof (uint64_t)); } return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); } static int nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len); int i; /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use bzero. */ bzero(value, len); for (i = 0; i < NVP_NELEM(nvp); i++, packed++) /* * Null out the pointer that is meaningless in the * packed structure. The address may not be aligned, * so we have to use bzero. */ bzero((char *)packed + offsetof(nvlist_t, nvl_priv), sizeof (uint64_t)); } return (nvs_embedded_nvl_array(nvs, nvp, NULL)); } static void nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; uint64_t *strp = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use bzero. */ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t)); break; } case NVS_OP_DECODE: { char **strp = (void *)NVP_VALUE(nvp); char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); int i; for (i = 0; i < NVP_NELEM(nvp); i++) { strp[i] = buf; buf += strlen(buf) + 1; } break; } } } static int nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { data_type_t type; int value_sz; int ret = 0; /* * We do the initial bcopy of the data before we look at * the nvpair type, because when we're decoding, we won't * have the correct values for the pair until we do the bcopy. */ switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native_cp(nvs, nvp, nvp->nvp_size) != 0) return (EFAULT); break; default: return (EINVAL); } /* verify nvp_name_sz, check the name string length */ if (i_validate_nvpair_name(nvp) != 0) return (EFAULT); type = NVP_TYPE(nvp); /* * Verify type and nelem and get the value size. * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) excluded. */ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) return (EFAULT); if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: ret = nvpair_native_embedded(nvs, nvp); break; case DATA_TYPE_NVLIST_ARRAY: ret = nvpair_native_embedded_array(nvs, nvp); break; case DATA_TYPE_STRING_ARRAY: nvpair_native_string_array(nvs, nvp); break; default: break; } return (ret); } static int nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { uint64_t nvp_sz = nvp->nvp_size; switch (NVP_TYPE(nvp)) { case DATA_TYPE_NVLIST: { size_t nvsize = 0; if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize; if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } default: break; } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } static int nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: return (nvs_native_nvp_op(nvs, nvp)); case NVS_OP_DECODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; int32_t decode_len; /* try to read the size value from the stream */ if (native->n_curr + sizeof (int32_t) > native->n_end) return (EFAULT); bcopy(native->n_curr, &decode_len, sizeof (int32_t)); /* sanity check the size value */ if (decode_len < 0 || decode_len > native->n_end - native->n_curr) return (EFAULT); *size = decode_len; /* * If at the end of the stream then move the cursor * forward, otherwise nvpair_native_op() will read * the entire nvpair at the same cursor position. */ if (*size == 0) native->n_curr += sizeof (int32_t); break; } default: return (EINVAL); } return (0); } static const nvs_ops_t nvs_native_ops = { .nvs_nvlist = nvs_native_nvlist, .nvs_nvpair = nvs_native_nvpair, .nvs_nvp_op = nvs_native_nvp_op, .nvs_nvp_size = nvs_native_nvp_size, .nvs_nvl_fini = nvs_native_nvl_fini }; static int nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { nvs_native_t native; int err; nvs->nvs_ops = &nvs_native_ops; if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_native_destroy(nvs); return (err); } /* * XDR encoding functions * * An xdr packed nvlist is encoded as: * * - encoding method and host endian (4 bytes) * - nvl_version (4 bytes) * - nvl_nvflag (4 bytes) * * - encoded nvpairs, the format of one xdr encoded nvpair is: * - encoded size of the nvpair (4 bytes) * - decoded size of the nvpair (4 bytes) * - name string, (4 + sizeof(NV_ALIGN4(string)) * a string is coded as size (4 bytes) and data * - data type (4 bytes) * - number of elements in the nvpair (4 bytes) * - data * * - 2 zero's for end of the entire list (8 bytes) */ static int nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) { /* xdr data must be 4 byte aligned */ if ((ulong_t)buf % 4 != 0) return (EFAULT); switch (nvs->nvs_op) { case NVS_OP_ENCODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); nvs->nvs_private = xdr; return (0); case NVS_OP_DECODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); nvs->nvs_private = xdr; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = NULL; return (0); default: return (EINVAL); } } static void nvs_xdr_destroy(nvstream_t *nvs) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: xdr_destroy((XDR *)nvs->nvs_private); break; default: break; } } static int nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: { XDR *xdr = nvs->nvs_private; if (!xdr_int(xdr, &nvl->nvl_version) || !xdr_u_int(xdr, &nvl->nvl_nvflag)) return (EFAULT); break; } case NVS_OP_GETSIZE: { /* * 2 * 4 for nvl_version + nvl_nvflag * and 8 for end of the entire list */ *size += 2 * 4 + 8; break; } default: return (EINVAL); } return (0); } static int nvs_xdr_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { XDR *xdr = nvs->nvs_private; int zero = 0; if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) return (EFAULT); } return (0); } /* * The format of xdr encoded nvpair is: * encode_size, decode_size, name string, data type, nelem, data */ static int nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { data_type_t type; char *buf; char *buf_end = (char *)nvp + nvp->nvp_size; int value_sz; uint_t nelem, buflen; bool_t ret = FALSE; XDR *xdr = nvs->nvs_private; ASSERT(xdr != NULL && nvp != NULL); /* name string */ if ((buf = NVP_NAME(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (!xdr_string(xdr, &buf, buflen - 1)) return (EFAULT); nvp->nvp_name_sz = strlen(buf) + 1; /* type and nelem */ if (!xdr_int(xdr, (int *)&nvp->nvp_type) || !xdr_int(xdr, &nvp->nvp_value_elem)) return (EFAULT); type = NVP_TYPE(nvp); nelem = nvp->nvp_value_elem; /* * Verify type and nelem and get the value size. * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) excluded. */ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) return (EFAULT); /* if there is no data to extract then return */ if (nelem == 0) return (0); /* value */ if ((buf = NVP_VALUE(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (buflen < value_sz) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: if (nvs_embedded(nvs, (void *)buf) == 0) return (0); break; case DATA_TYPE_NVLIST_ARRAY: if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) return (0); break; case DATA_TYPE_BOOLEAN: ret = TRUE; break; case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: ret = xdr_char(xdr, buf); break; case DATA_TYPE_INT16: ret = xdr_short(xdr, (void *)buf); break; case DATA_TYPE_UINT16: ret = xdr_u_short(xdr, (void *)buf); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_INT32: ret = xdr_int(xdr, (void *)buf); break; case DATA_TYPE_UINT32: ret = xdr_u_int(xdr, (void *)buf); break; case DATA_TYPE_INT64: ret = xdr_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_UINT64: ret = xdr_u_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_HRTIME: /* * NOTE: must expose the definition of hrtime_t here */ ret = xdr_longlong_t(xdr, (void *)buf); break; #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: ret = xdr_double(xdr, (void *)buf); break; #endif case DATA_TYPE_STRING: ret = xdr_string(xdr, &buf, buflen - 1); break; case DATA_TYPE_BYTE_ARRAY: ret = xdr_opaque(xdr, buf, nelem); break; case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), (xdrproc_t)xdr_char); break; case DATA_TYPE_INT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), sizeof (int16_t), (xdrproc_t)xdr_short); break; case DATA_TYPE_UINT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), sizeof (uint16_t), (xdrproc_t)xdr_u_short); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), sizeof (int32_t), (xdrproc_t)xdr_int); break; case DATA_TYPE_UINT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), sizeof (uint32_t), (xdrproc_t)xdr_u_int); break; case DATA_TYPE_INT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), sizeof (int64_t), (xdrproc_t)xdr_longlong_t); break; case DATA_TYPE_UINT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); break; case DATA_TYPE_STRING_ARRAY: { size_t len = nelem * sizeof (uint64_t); char **strp = (void *)buf; int i; if (nvs->nvs_op == NVS_OP_DECODE) bzero(buf, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if (buflen <= len) return (EFAULT); buf += len; buflen -= len; if (xdr_string(xdr, &buf, buflen - 1) != TRUE) return (EFAULT); if (nvs->nvs_op == NVS_OP_DECODE) strp[i] = buf; len = strlen(buf) + 1; } ret = TRUE; break; } default: break; } return (ret == TRUE ? 0 : EFAULT); } static int nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { data_type_t type = NVP_TYPE(nvp); /* * encode_size + decode_size + name string size + data type + nelem * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) */ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: nvp_sz += 4; /* 4 is the minimum xdr unit */ break; case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif nvp_sz += 8; break; case DATA_TYPE_STRING: nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); break; case DATA_TYPE_BYTE_ARRAY: nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_STRING_ARRAY: { int i; char **strs = (void *)NVP_VALUE(nvp); for (i = 0; i < NVP_NELEM(nvp); i++) nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); break; } case DATA_TYPE_NVLIST: case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize = 0; int old_nvs_op = nvs->nvs_op; int err; nvs->nvs_op = NVS_OP_GETSIZE; if (type == DATA_TYPE_NVLIST) err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); else err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); nvs->nvs_op = old_nvs_op; if (err != 0) return (EINVAL); nvp_sz += nvsize; break; } default: return (EINVAL); } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } /* * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates * the largest nvpair that could be encoded in the buffer. * * See comments above nvpair_xdr_op() for the format of xdr encoding. * The size of a xdr packed nvpair without any data is 5 words. * * Using the size of the data directly as an estimate would be ok * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY * then the actual nvpair has space for an array of pointers to index * the strings. These pointers are not encoded into the packed xdr buffer. * * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are * of length 0, then each string is encoded in xdr format as a single word. * Therefore when expanded to an nvpair there will be 2.25 word used for * each string. (a int64_t allocated for pointer usage, and a single char * for the null termination.) * * This is the calculation performed by the NVS_XDR_MAX_LEN macro. */ #define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) #define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) #define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ (NVS_XDR_DATA_LEN(x) * 2) + \ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) static int nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { XDR *xdr = nvs->nvs_private; int32_t encode_len, decode_len; switch (nvs->nvs_op) { case NVS_OP_ENCODE: { size_t nvsize; if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) return (EFAULT); decode_len = nvp->nvp_size; encode_len = nvsize; if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); return (nvs_xdr_nvp_op(nvs, nvp)); } case NVS_OP_DECODE: { struct xdr_bytesrec bytesrec; /* get the encode and decode size */ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); *size = decode_len; /* are we at the end of the stream? */ if (*size == 0) return (0); /* sanity check the size parameter */ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) return (EFAULT); if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) return (EFAULT); break; } default: return (EINVAL); } return (0); } static const struct nvs_ops nvs_xdr_ops = { .nvs_nvlist = nvs_xdr_nvlist, .nvs_nvpair = nvs_xdr_nvpair, .nvs_nvp_op = nvs_xdr_nvp_op, .nvs_nvp_size = nvs_xdr_nvp_size, .nvs_nvl_fini = nvs_xdr_nvl_fini }; static int nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { XDR xdr; int err; nvs->nvs_ops = &nvs_xdr_ops; if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_xdr_destroy(nvs); return (err); } #if defined(_KERNEL) static int __init nvpair_init(void) { return (0); } static void __exit nvpair_fini(void) { } module_init(nvpair_init); module_exit(nvpair_fini); #endif ZFS_MODULE_DESCRIPTION("Generic name/value pair implementation"); ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); ZFS_MODULE_LICENSE(ZFS_META_LICENSE); ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(nv_alloc_init); EXPORT_SYMBOL(nv_alloc_reset); EXPORT_SYMBOL(nv_alloc_fini); /* list management */ EXPORT_SYMBOL(nvlist_alloc); EXPORT_SYMBOL(nvlist_free); EXPORT_SYMBOL(nvlist_size); EXPORT_SYMBOL(nvlist_pack); EXPORT_SYMBOL(nvlist_unpack); EXPORT_SYMBOL(nvlist_dup); EXPORT_SYMBOL(nvlist_merge); EXPORT_SYMBOL(nvlist_xalloc); EXPORT_SYMBOL(nvlist_xpack); EXPORT_SYMBOL(nvlist_xunpack); EXPORT_SYMBOL(nvlist_xdup); EXPORT_SYMBOL(nvlist_lookup_nv_alloc); EXPORT_SYMBOL(nvlist_add_nvpair); EXPORT_SYMBOL(nvlist_add_boolean); EXPORT_SYMBOL(nvlist_add_boolean_value); EXPORT_SYMBOL(nvlist_add_byte); EXPORT_SYMBOL(nvlist_add_int8); EXPORT_SYMBOL(nvlist_add_uint8); EXPORT_SYMBOL(nvlist_add_int16); EXPORT_SYMBOL(nvlist_add_uint16); EXPORT_SYMBOL(nvlist_add_int32); EXPORT_SYMBOL(nvlist_add_uint32); EXPORT_SYMBOL(nvlist_add_int64); EXPORT_SYMBOL(nvlist_add_uint64); EXPORT_SYMBOL(nvlist_add_string); EXPORT_SYMBOL(nvlist_add_nvlist); EXPORT_SYMBOL(nvlist_add_boolean_array); EXPORT_SYMBOL(nvlist_add_byte_array); EXPORT_SYMBOL(nvlist_add_int8_array); EXPORT_SYMBOL(nvlist_add_uint8_array); EXPORT_SYMBOL(nvlist_add_int16_array); EXPORT_SYMBOL(nvlist_add_uint16_array); EXPORT_SYMBOL(nvlist_add_int32_array); EXPORT_SYMBOL(nvlist_add_uint32_array); EXPORT_SYMBOL(nvlist_add_int64_array); EXPORT_SYMBOL(nvlist_add_uint64_array); EXPORT_SYMBOL(nvlist_add_string_array); EXPORT_SYMBOL(nvlist_add_nvlist_array); EXPORT_SYMBOL(nvlist_next_nvpair); EXPORT_SYMBOL(nvlist_prev_nvpair); EXPORT_SYMBOL(nvlist_empty); EXPORT_SYMBOL(nvlist_add_hrtime); EXPORT_SYMBOL(nvlist_remove); EXPORT_SYMBOL(nvlist_remove_nvpair); EXPORT_SYMBOL(nvlist_remove_all); EXPORT_SYMBOL(nvlist_lookup_boolean); EXPORT_SYMBOL(nvlist_lookup_boolean_value); EXPORT_SYMBOL(nvlist_lookup_byte); EXPORT_SYMBOL(nvlist_lookup_int8); EXPORT_SYMBOL(nvlist_lookup_uint8); EXPORT_SYMBOL(nvlist_lookup_int16); EXPORT_SYMBOL(nvlist_lookup_uint16); EXPORT_SYMBOL(nvlist_lookup_int32); EXPORT_SYMBOL(nvlist_lookup_uint32); EXPORT_SYMBOL(nvlist_lookup_int64); EXPORT_SYMBOL(nvlist_lookup_uint64); EXPORT_SYMBOL(nvlist_lookup_string); EXPORT_SYMBOL(nvlist_lookup_nvlist); EXPORT_SYMBOL(nvlist_lookup_boolean_array); EXPORT_SYMBOL(nvlist_lookup_byte_array); EXPORT_SYMBOL(nvlist_lookup_int8_array); EXPORT_SYMBOL(nvlist_lookup_uint8_array); EXPORT_SYMBOL(nvlist_lookup_int16_array); EXPORT_SYMBOL(nvlist_lookup_uint16_array); EXPORT_SYMBOL(nvlist_lookup_int32_array); EXPORT_SYMBOL(nvlist_lookup_uint32_array); EXPORT_SYMBOL(nvlist_lookup_int64_array); EXPORT_SYMBOL(nvlist_lookup_uint64_array); EXPORT_SYMBOL(nvlist_lookup_string_array); EXPORT_SYMBOL(nvlist_lookup_nvlist_array); EXPORT_SYMBOL(nvlist_lookup_hrtime); EXPORT_SYMBOL(nvlist_lookup_pairs); EXPORT_SYMBOL(nvlist_lookup_nvpair); EXPORT_SYMBOL(nvlist_exists); /* processing nvpair */ EXPORT_SYMBOL(nvpair_name); EXPORT_SYMBOL(nvpair_type); EXPORT_SYMBOL(nvpair_value_boolean_value); EXPORT_SYMBOL(nvpair_value_byte); EXPORT_SYMBOL(nvpair_value_int8); EXPORT_SYMBOL(nvpair_value_uint8); EXPORT_SYMBOL(nvpair_value_int16); EXPORT_SYMBOL(nvpair_value_uint16); EXPORT_SYMBOL(nvpair_value_int32); EXPORT_SYMBOL(nvpair_value_uint32); EXPORT_SYMBOL(nvpair_value_int64); EXPORT_SYMBOL(nvpair_value_uint64); EXPORT_SYMBOL(nvpair_value_string); EXPORT_SYMBOL(nvpair_value_nvlist); EXPORT_SYMBOL(nvpair_value_boolean_array); EXPORT_SYMBOL(nvpair_value_byte_array); EXPORT_SYMBOL(nvpair_value_int8_array); EXPORT_SYMBOL(nvpair_value_uint8_array); EXPORT_SYMBOL(nvpair_value_int16_array); EXPORT_SYMBOL(nvpair_value_uint16_array); EXPORT_SYMBOL(nvpair_value_int32_array); EXPORT_SYMBOL(nvpair_value_uint32_array); EXPORT_SYMBOL(nvpair_value_int64_array); EXPORT_SYMBOL(nvpair_value_uint64_array); EXPORT_SYMBOL(nvpair_value_string_array); EXPORT_SYMBOL(nvpair_value_nvlist_array); EXPORT_SYMBOL(nvpair_value_hrtime); Index: vendor-sys/openzfs/dist/module/os/freebsd/spl/spl_kstat.c =================================================================== --- vendor-sys/openzfs/dist/module/os/freebsd/spl/spl_kstat.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/freebsd/spl/spl_kstat.c (revision 365340) @@ -1,351 +1,519 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * Links to Illumos.org for more information on kstat function: + * [1] https://illumos.org/man/1M/kstat + * [2] https://illumos.org/man/9f/kstat_create */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics"); void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } static int kstat_default_update(kstat_t *ksp, int rw) { ASSERT(ksp != NULL); if (rw == KSTAT_WRITE) return (EACCES); return (0); } +static int +kstat_resize_raw(kstat_t *ksp) +{ + if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) + return (ENOMEM); + + free(ksp->ks_raw_buf, M_TEMP); + ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); + ksp->ks_raw_buf = malloc(ksp->ks_raw_bufsize, M_TEMP, M_WAITOK); + + return (0); +} + +static void * +kstat_raw_default_addr(kstat_t *ksp, loff_t n) +{ + if (n == 0) + return (ksp->ks_data); + return (NULL); +} + +static int +kstat_sysctl(SYSCTL_HANDLER_ARGS) +{ + kstat_t *ksp = arg1; + kstat_named_t *ksent; + uint64_t val; + + ksent = ksp->ks_data; + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + val = ksent->value.ui64; + + return (sysctl_handle_64(oidp, &val, 0, req)); +} + +static int +kstat_sysctl_string(SYSCTL_HANDLER_ARGS) +{ + kstat_t *ksp = arg1; + kstat_named_t *ksent = ksp->ks_data; + char *val; + uint32_t len = 0; + + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + val = KSTAT_NAMED_STR_PTR(ksent); + len = KSTAT_NAMED_STR_BUFLEN(ksent); + val[len-1] = '\0'; + + return (sysctl_handle_string(oidp, val, len, req)); +} + +static int +kstat_sysctl_io(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + kstat_t *ksp = arg1; + kstat_io_t *kip = ksp->ks_data; + int rc; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + /* though wlentime & friends are signed, they will never be negative */ + sbuf_printf(sb, + "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " + "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", + kip->nread, kip->nwritten, + kip->reads, kip->writes, + kip->wtime, kip->wlentime, kip->wlastupdate, + kip->rtime, kip->rlentime, kip->rlastupdate, + kip->wcnt, kip->rcnt); + rc = sbuf_finish(sb); + if (rc == 0) + rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + sbuf_delete(sb); + return (rc); +} + +static int +kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + void *data; + kstat_t *ksp = arg1; + void *(*addr_op)(kstat_t *ksp, loff_t index); + int n, rc = 0; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + + if (ksp->ks_raw_ops.addr) + addr_op = ksp->ks_raw_ops.addr; + else + addr_op = kstat_raw_default_addr; + + mutex_enter(ksp->ks_lock); + + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + ksp->ks_raw_bufsize = PAGE_SIZE; + ksp->ks_raw_buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK); + + n = 0; +restart_headers: + if (ksp->ks_raw_ops.headers) { + rc = ksp->ks_raw_ops.headers( + ksp->ks_raw_buf, ksp->ks_raw_bufsize); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart_headers; + if (rc == 0) + sbuf_printf(sb, "%s", ksp->ks_raw_buf); + } + + while ((data = addr_op(ksp, n)) != NULL) { +restart: + if (ksp->ks_raw_ops.data) { + rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf, + ksp->ks_raw_bufsize, data); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (rc == 0) + sbuf_printf(sb, "%s", ksp->ks_raw_buf); + + } else { + ASSERT(ksp->ks_ndata == 1); + sbuf_hexdump(sb, ksp->ks_data, + ksp->ks_data_size, NULL, 0); + } + n++; + } + free(ksp->ks_raw_buf, M_TEMP); + mutex_exit(ksp->ks_lock); + rc = sbuf_finish(sb); + if (rc == 0) + rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + sbuf_delete(sb); + return (rc); +} + kstat_t * __kstat_create(const char *module, int instance, const char *name, const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags) { struct sysctl_oid *root; kstat_t *ksp; KASSERT(instance == 0, ("instance=%d", instance)); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) ASSERT(ks_ndata == 1); /* * Allocate the main structure. We don't need to copy module/class/name * stuff in here, because it is only used for sysctl node creation * done in this function. */ ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO); ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; ksp->ks_instance = instance; strncpy(ksp->ks_name, name, KSTAT_STRLEN); strncpy(ksp->ks_class, class, KSTAT_STRLEN); ksp->ks_type = ks_type; ksp->ks_flags = flags; ksp->ks_update = kstat_default_update; + mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); + ksp->ks_lock = &ksp->ks_private_lock; + switch (ksp->ks_type) { case KSTAT_TYPE_RAW: ksp->ks_ndata = 1; ksp->ks_data_size = ks_ndata; break; case KSTAT_TYPE_NAMED: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); break; default: panic("Undefined kstat type %d\n", ksp->ks_type); } if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { ksp->ks_data = NULL; } else { ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); if (ksp->ks_data == NULL) { kmem_free(ksp, sizeof (*ksp)); ksp = NULL; } } /* * Create sysctl tree for those statistics: * * kstat.... */ sysctl_ctx_init(&ksp->ks_sysctl_ctx); root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s tree!\n", __func__, module); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, class, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, module, class); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } - root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), - OID_AUTO, name, CTLFLAG_RW, 0, ""); - if (root == NULL) { - printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__, - module, class, name); - sysctl_ctx_free(&ksp->ks_sysctl_ctx); - free(ksp, M_KSTAT); - return (NULL); + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(root), + OID_AUTO, name, CTLFLAG_RW, 0, ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s.%s.%s tree!\n", + __func__, module, class, name); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + } ksp->ks_sysctl_root = root; return (ksp); } -static int -kstat_sysctl(SYSCTL_HANDLER_ARGS) +static void +kstat_install_named(kstat_t *ksp) { - kstat_t *ksp = arg1; - kstat_named_t *ksent = ksp->ks_data; - uint64_t val; - - /* Select the correct element */ - ksent += arg2; - /* Update the aggsums before reading */ - (void) ksp->ks_update(ksp, KSTAT_READ); - val = ksent->value.ui64; - - return (sysctl_handle_64(oidp, &val, 0, req)); -} - -static int -kstat_sysctl_string(SYSCTL_HANDLER_ARGS) -{ - kstat_t *ksp = arg1; - kstat_named_t *ksent = ksp->ks_data; - char *val; - uint32_t len = 0; - - /* Select the correct element */ - ksent += arg2; - /* Update the aggsums before reading */ - (void) ksp->ks_update(ksp, KSTAT_READ); - val = KSTAT_NAMED_STR_PTR(ksent); - len = KSTAT_NAMED_STR_BUFLEN(ksent); - val[len-1] = '\0'; - - return (sysctl_handle_string(oidp, val, len, req)); -} - -void -kstat_install(kstat_t *ksp) -{ kstat_named_t *ksent; char *namelast; int typelast; ksent = ksp->ks_data; - if (ksp->ks_ndata == UINT32_MAX) { -#ifdef INVARIANTS - printf("can't handle raw ops yet!!!\n"); -#endif - return; - } - if (ksent == NULL) { - printf("%s ksp->ks_data == NULL!!!!\n", __func__); - return; - } + + VERIFY((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) || ksent != NULL); + typelast = 0; namelast = NULL; + for (int i = 0; i < ksp->ks_ndata; i++, ksent++) { if (ksent->data_type != 0) { typelast = ksent->data_type; namelast = ksent->name; } switch (typelast) { case KSTAT_DATA_CHAR: /* Not Implemented */ break; case KSTAT_DATA_INT32: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_S32 | CTLFLAG_RD, ksp, i, kstat_sysctl, "I", namelast); break; case KSTAT_DATA_UINT32: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_U32 | CTLFLAG_RD, ksp, i, kstat_sysctl, "IU", namelast); break; case KSTAT_DATA_INT64: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_S64 | CTLFLAG_RD, ksp, i, kstat_sysctl, "Q", namelast); break; case KSTAT_DATA_UINT64: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_U64 | CTLFLAG_RD, ksp, i, kstat_sysctl, "QU", namelast); break; case KSTAT_DATA_LONG: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_LONG | CTLFLAG_RD, ksp, i, kstat_sysctl, "L", namelast); break; case KSTAT_DATA_ULONG: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_ULONG | CTLFLAG_RD, ksp, i, kstat_sysctl, "LU", namelast); break; case KSTAT_DATA_STRING: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_STRING | CTLFLAG_RD, ksp, i, kstat_sysctl_string, "A", namelast); break; default: panic("unsupported type: %d", typelast); } } + } void +kstat_install(kstat_t *ksp) +{ + struct sysctl_oid *root; + + if (ksp->ks_ndata == UINT32_MAX) + VERIFY(ksp->ks_type == KSTAT_TYPE_RAW); + + switch (ksp->ks_type) { + case KSTAT_TYPE_NAMED: + return (kstat_install_named(ksp)); + break; + case KSTAT_TYPE_RAW: + if (ksp->ks_raw_ops.data) { + root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, ksp->ks_name, + CTLTYPE_STRING | CTLFLAG_RD, ksp, 0, + kstat_sysctl_raw, "A", ksp->ks_name); + } else { + root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, ksp->ks_name, + CTLTYPE_OPAQUE | CTLFLAG_RD, ksp, 0, + kstat_sysctl_raw, "", ksp->ks_name); + } + VERIFY(root != NULL); + break; + case KSTAT_TYPE_IO: + root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, ksp->ks_name, + CTLTYPE_STRING | CTLFLAG_RD, ksp, 0, + kstat_sysctl_io, "A", ksp->ks_name); + break; + case KSTAT_TYPE_TIMER: + case KSTAT_TYPE_INTR: + default: + panic("unsupported kstat type %d\n", ksp->ks_type); + } + ksp->ks_sysctl_root = root; + +} + +void kstat_delete(kstat_t *ksp) { sysctl_ctx_free(&ksp->ks_sysctl_ctx); + ksp->ks_lock = NULL; + mutex_destroy(&ksp->ks_private_lock); free(ksp, M_KSTAT); } void kstat_waitq_enter(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t wcnt; new = gethrtime(); delta = new - kiop->wlastupdate; kiop->wlastupdate = new; wcnt = kiop->wcnt++; if (wcnt != 0) { kiop->wlentime += delta * wcnt; kiop->wtime += delta; } } void kstat_waitq_exit(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t wcnt; new = gethrtime(); delta = new - kiop->wlastupdate; kiop->wlastupdate = new; wcnt = kiop->wcnt--; ASSERT((int)wcnt > 0); kiop->wlentime += delta * wcnt; kiop->wtime += delta; } void kstat_runq_enter(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t rcnt; new = gethrtime(); delta = new - kiop->rlastupdate; kiop->rlastupdate = new; rcnt = kiop->rcnt++; if (rcnt != 0) { kiop->rlentime += delta * rcnt; kiop->rtime += delta; } } void kstat_runq_exit(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t rcnt; new = gethrtime(); delta = new - kiop->rlastupdate; kiop->rlastupdate = new; rcnt = kiop->rcnt--; ASSERT((int)rcnt > 0); kiop->rlentime += delta * rcnt; kiop->rtime += delta; } Index: vendor-sys/openzfs/dist/module/os/freebsd/spl/spl_zone.c =================================================================== --- vendor-sys/openzfs/dist/module/os/freebsd/spl/spl_zone.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/freebsd/spl/spl_zone.c (revision 365340) @@ -1,266 +1,260 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data"); /* * Structure to record list of ZFS datasets exported to a zone. */ typedef struct zone_dataset { LIST_ENTRY(zone_dataset) zd_next; char zd_dataset[0]; } zone_dataset_t; LIST_HEAD(zone_dataset_head, zone_dataset); static int zone_slot; int zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) { struct zone_dataset_head *head; zone_dataset_t *zd, *zd2; struct prison *pr; int dofree, error; if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) return (error); /* Allocate memory before we grab prison's mutex. */ zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK); sx_slock(&allprison_lock); pr = prison_find(jailid); /* Locks &pr->pr_mtx. */ sx_sunlock(&allprison_lock); if (pr == NULL) { free(zd, M_ZONES); return (ENOENT); } head = osd_jail_get(pr, zone_slot); if (head != NULL) { dofree = 0; LIST_FOREACH(zd2, head, zd_next) { if (strcmp(dataset, zd2->zd_dataset) == 0) { free(zd, M_ZONES); error = EEXIST; goto end; } } } else { dofree = 1; prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); head = malloc(sizeof (*head), M_ZONES, M_WAITOK); LIST_INIT(head); mtx_lock(&pr->pr_mtx); error = osd_jail_set(pr, zone_slot, head); KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", error)); } strcpy(zd->zd_dataset, dataset); LIST_INSERT_HEAD(head, zd, zd_next); end: if (dofree) prison_free_locked(pr); else mtx_unlock(&pr->pr_mtx); return (error); } int zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid) { struct zone_dataset_head *head; zone_dataset_t *zd; struct prison *pr; int error; if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) return (error); sx_slock(&allprison_lock); pr = prison_find(jailid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENOENT); head = osd_jail_get(pr, zone_slot); if (head == NULL) { error = ENOENT; goto end; } LIST_FOREACH(zd, head, zd_next) { if (strcmp(dataset, zd->zd_dataset) == 0) break; } if (zd == NULL) error = ENOENT; else { LIST_REMOVE(zd, zd_next); free(zd, M_ZONES); if (LIST_EMPTY(head)) osd_jail_del(pr, zone_slot); error = 0; } end: mtx_unlock(&pr->pr_mtx); return (error); } /* * Returns true if the named dataset is visible in the current zone. * The 'write' parameter is set to 1 if the dataset is also writable. */ int zone_dataset_visible(const char *dataset, int *write) { struct zone_dataset_head *head; zone_dataset_t *zd; struct prison *pr; size_t len; int ret = 0; if (dataset[0] == '\0') return (0); if (INGLOBALZONE(curproc)) { if (write != NULL) *write = 1; return (1); } pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); head = osd_jail_get(pr, zone_slot); if (head == NULL) goto end; /* * Walk the list once, looking for datasets which match exactly, or * specify a dataset underneath an exported dataset. If found, return * true and note that it is writable. */ LIST_FOREACH(zd, head, zd_next) { len = strlen(zd->zd_dataset); if (strlen(dataset) >= len && bcmp(dataset, zd->zd_dataset, len) == 0 && (dataset[len] == '\0' || dataset[len] == '/' || dataset[len] == '@')) { if (write) *write = 1; ret = 1; goto end; } } /* * Walk the list a second time, searching for datasets which are parents * of exported datasets. These should be visible, but read-only. * * Note that we also have to support forms such as 'pool/dataset/', with * a trailing slash. */ LIST_FOREACH(zd, head, zd_next) { len = strlen(dataset); if (dataset[len - 1] == '/') len--; /* Ignore trailing slash */ if (len < strlen(zd->zd_dataset) && bcmp(dataset, zd->zd_dataset, len) == 0 && zd->zd_dataset[len] == '/') { if (write) *write = 0; ret = 1; goto end; } } end: mtx_unlock(&pr->pr_mtx); return (ret); } static void zone_destroy(void *arg) { struct zone_dataset_head *head; zone_dataset_t *zd; head = arg; while ((zd = LIST_FIRST(head)) != NULL) { LIST_REMOVE(zd, zd_next); free(zd, M_ZONES); } free(head, M_ZONES); } uint32_t zone_get_hostid(void *ptr) { KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__)); return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid); } -boolean_t -in_globalzone(struct proc *p) -{ - return (!jailed(FIRST_THREAD_IN_PROC((p))->td_ucred)); -} - static void zone_sysinit(void *arg __unused) { zone_slot = osd_jail_register(zone_destroy, NULL); } static void zone_sysuninit(void *arg __unused) { osd_jail_deregister(zone_slot); } SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL); SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL); Index: vendor-sys/openzfs/dist/module/os/freebsd/zfs/spa_stats.c =================================================================== --- vendor-sys/openzfs/dist/module/os/freebsd/zfs/spa_stats.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/freebsd/zfs/spa_stats.c (revision 365340) @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2020 iXsystems, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include - -void -spa_stats_init(spa_t *spa) -{ - -} - -void -spa_stats_destroy(spa_t *spa) -{ - -} - -void -spa_iostats_trim_add(spa_t *spa, trim_type_t type, - uint64_t extents_written, uint64_t bytes_written, - uint64_t extents_skipped, uint64_t bytes_skipped, - uint64_t extents_failed, uint64_t bytes_failed) -{ -} - -void -spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) -{ -} - -void -spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) -{ - -} -/* - * Set txg state completion time and increment current state. - */ -int -spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, - hrtime_t completed_time) -{ - return (0); -} - -txg_stat_t * -spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) -{ - return (NULL); -} - -void -spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) -{ - -} - -void -spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) -{ - -} - -void -spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, - uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, - int error) -{ - -} - -int -spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, - hrtime_t duration) -{ - return (0); -} - -int -spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) -{ - return (0); -} Index: vendor-sys/openzfs/dist/module/os/freebsd/zfs/zfs_vfsops.c =================================================================== --- vendor-sys/openzfs/dist/module/os/freebsd/zfs/zfs_vfsops.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/freebsd/zfs/zfs_vfsops.c (revision 365340) @@ -1,2482 +1,2289 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #ifndef MNTK_VMSETSIZE_BUG #define MNTK_VMSETSIZE_BUG 0 #endif #ifndef MNTK_NOMSYNC #define MNTK_NOMSYNC 8 #endif /* BEGIN CSTYLED */ struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); /* END CSTYLED */ static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); #if __FreeBSD_version >= 1300098 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors); #else static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); #endif static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, #if __FreeBSD_version >= 1300049 .vfs_root = vfs_cache_root, .vfs_cachedroot = zfs_root, #else .vfs_root = zfs_root, #endif .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, .vfs_quotactl = zfs_quotactl, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, &zfvp); if (error != 0) return (error); if (zfvp == NULL) return (ENOENT); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) tmp = 1; break; case ZFS_PROP_DEVICES: if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) tmp = 1; break; case ZFS_PROP_EXEC: if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) tmp = 1; break; case ZFS_PROP_SETUID: if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) tmp = 1; break; case ZFS_PROP_READONLY: if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) tmp = 1; break; case ZFS_PROP_XATTR: if (zfvp->z_flags & ZSB_XATTR) tmp = zfvp->z_xattr; break; case ZFS_PROP_NBMAND: if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) tmp = 1; break; default: vfs_unbusy(vfsp); return (ENOENT); } vfs_unbusy(vfsp); if (tmp != *val) { (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } static int zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) { int error = 0; char buf[32]; uint64_t usedobj, quotaobj; uint64_t quota, used = 0; timespec_t now; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) { error = ENOENT; goto done; } (void) sprintf(buf, "%llx", (longlong_t)id); if ((error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota), 1, "a)) != 0) { dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); goto done; } /* * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". * So we set them to be the same. */ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); if (error && error != ENOENT) { dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); goto done; } dqp->dqb_curblocks = btodb(used); dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; vfs_timestamp(&now); /* * Setting this to 0 causes FreeBSD quota(8) to print * the number of days since the epoch, which isn't * particularly useful. */ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; done: return (error); } static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) { zfsvfs_t *zfsvfs = vfsp->vfs_data; struct thread *td; int cmd, type, error = 0; int bitsize; zfs_userquota_prop_t quota_type; struct dqblk64 dqblk = { 0 }; td = curthread; cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; ZFS_ENTER(zfsvfs); if (id == -1) { switch (type) { case USRQUOTA: id = td->td_ucred->cr_ruid; break; case GRPQUOTA: id = td->td_ucred->cr_rgid; break; default: error = EINVAL; if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(vfsp); goto done; } } /* * Map BSD type to: * ZFS_PROP_USERUSED, * ZFS_PROP_USERQUOTA, * ZFS_PROP_GROUPUSED, * ZFS_PROP_GROUPQUOTA */ switch (cmd) { case Q_SETQUOTA: case Q_SETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERQUOTA; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPQUOTA; else error = EINVAL; break; case Q_GETQUOTA: case Q_GETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERUSED; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPUSED; else error = EINVAL; break; } /* * Depending on the cmd, we may need to get * the ruid and domain (see fuidstr_to_sid?), * the fuid (how?), or other information. * Create fuid using zfs_fuid_create(zfsvfs, id, * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? * I think I can use just the id? * * Look at zfs_id_overquota() to look up a quota. * zap_lookup(something, quotaobj, fuidstring, * sizeof (long long), 1, "a) * * See zfs_set_userquota() to set a quota. */ if ((uint32_t)type >= MAXQUOTAS) { error = EINVAL; goto done; } switch (cmd) { case Q_GETQUOTASIZE: bitsize = 64; error = copyout(&bitsize, arg, sizeof (int)); break; case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; vfs_unbusy(vfsp); break; case Q_QUOTAOFF: error = ENOTSUP; vfs_unbusy(vfsp); break; case Q_SETQUOTA: error = copyin(arg, &dqblk, sizeof (dqblk)); if (error == 0) error = zfs_set_userquota(zfsvfs, quota_type, "", id, dbtob(dqblk.dqb_bhardlimit)); break; case Q_GETQUOTA: error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); if (error == 0) error = copyout(&dqblk, arg, sizeof (dqblk)); break; default: error = EINVAL; break; } done: ZFS_EXIT(zfsvfs); return (error); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); } /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (rebooting && spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; boolean_t xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; boolean_t do_xattr = B_FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. The problem is that spa_write_cachefile() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } taskq_t *zfsvfs_taskq; static void zfsvfs_task_unlinked_drain(void *context, int pending __unused) { zfs_unlinked_drain((zfsvfs_t *)context); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); return (error); } int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, zfsvfs_task_unlinked_drain, zfsvfs); #ifdef DIAGNOSTIC rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); #else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); #endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) return (SET_ERROR(EROFS)); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) { zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; } else { dsl_dir_t *dd; zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { boolean_t use_nc = zfsvfs->z_use_namecache; zfsvfs->z_use_namecache = B_FALSE; zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; zfsvfs->z_use_namecache = use_nc; } } /* restore readonly bit */ if (readonly != 0) zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; /* * This is a barrier to prevent the filesystem from going away in * zfs_znode_move() until we can safely ensure that the filesystem is * not unmounted. We consider the filesystem valid before the barrier * and invalid after the barrier. */ rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); ASSERT(zfsvfs->z_nr_znodes == 0); list_destroy(&zfsvfs->z_all_znodes); rrm_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_vfs) { if (zfsvfs->z_use_fuids) { vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } else { vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; ASSERT(vfsp); ASSERT(osname); error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; /* * This can cause a loss of coherence between ARC and page cache * on ZoF - unclear if the problem is in FreeBSD or ZoF */ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ vfsp->mnt_kern_flag |= MNTK_NOMSYNC; vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; #if defined(_KERNEL) && !defined(KMEM_DEBUG) vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; #endif /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | (vfsp->mnt_vfc->vfc_typenum & 0xFF); /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } -#ifdef SECLABEL -/* - * Convert a decimal digit string to a uint64_t integer. - */ static int -str_to_uint64(char *str, uint64_t *objnum) -{ - uint64_t num = 0; - - while (*str) { - if (*str < '0' || *str > '9') - return (SET_ERROR(EINVAL)); - - num = num*10 + *str++ - '0'; - } - - *objnum = num; - return (0); -} - -/* - * The boot path passed from the boot loader is in the form of - * "rootpool-name/root-filesystem-object-number'. Convert this - * string to a dataset name: "rootpool-name/root-filesystem-name". - */ -static int -zfs_parse_bootfs(char *bpath, char *outpath) -{ - char *slashp; - uint64_t objnum; - int error; - - if (*bpath == 0 || *bpath == '/') - return (SET_ERROR(EINVAL)); - - (void) strcpy(outpath, bpath); - - slashp = strchr(bpath, '/'); - - /* if no '/', just return the pool name */ - if (slashp == NULL) { - return (0); - } - - /* if not a number, just return the root dataset name */ - if (str_to_uint64(slashp+1, &objnum)) { - return (0); - } - - *slashp = '\0'; - error = dsl_dsobj_to_dsname(bpath, objnum, outpath); - *slashp = '/'; - - return (error); -} - -/* - * Check that the hex label string is appropriate for the dataset being - * mounted into the global_zone proper. - * - * Return an error if the hex label string is not default or - * admin_low/admin_high. For admin_low labels, the corresponding - * dataset must be readonly. - */ -int -zfs_check_global_label(const char *dsname, const char *hexsl) -{ - if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) - return (0); - if (strcasecmp(hexsl, ADMIN_HIGH) == 0) - return (0); - if (strcasecmp(hexsl, ADMIN_LOW) == 0) { - /* must be readonly */ - uint64_t rdonly; - - if (dsl_prop_get_integer(dsname, - zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) - return (SET_ERROR(EACCES)); - return (rdonly ? 0 : EACCES); - } - return (SET_ERROR(EACCES)); -} - -/* - * Determine whether the mount is allowed according to MAC check. - * by comparing (where appropriate) label of the dataset against - * the label of the zone being mounted into. If the dataset has - * no label, create one. - * - * Returns 0 if access allowed, error otherwise (e.g. EACCES) - */ -static int -zfs_mount_label_policy(vfs_t *vfsp, char *osname) -{ - int error, retv; - zone_t *mntzone = NULL; - ts_label_t *mnt_tsl; - bslabel_t *mnt_sl; - bslabel_t ds_sl; - char ds_hexsl[MAXNAMELEN]; - - retv = EACCES; /* assume the worst */ - - /* - * Start by getting the dataset label if it exists. - */ - error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), - 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error) - return (SET_ERROR(EACCES)); - - /* - * If labeling is NOT enabled, then disallow the mount of datasets - * which have a non-default label already. No other label checks - * are needed. - */ - if (!is_system_labeled()) { - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) - return (0); - return (SET_ERROR(EACCES)); - } - - /* - * Get the label of the mountpoint. If mounting into the global - * zone (i.e. mountpoint is not within an active zone and the - * zoned property is off), the label must be default or - * admin_low/admin_high only; no other checks are needed. - */ - mntzone = zone_find_by_any_path(vfsp->vfs_mntpt, B_FALSE); - if (mntzone->zone_id == GLOBAL_ZONEID) { - uint64_t zoned; - - zone_rele(mntzone); - - if (dsl_prop_get_integer(osname, - zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) - return (SET_ERROR(EACCES)); - if (!zoned) - return (zfs_check_global_label(osname, ds_hexsl)); - else - /* - * This is the case of a zone dataset being mounted - * initially, before the zone has been fully created; - * allow this mount into global zone. - */ - return (0); - } - - mnt_tsl = mntzone->zone_slabel; - ASSERT(mnt_tsl != NULL); - label_hold(mnt_tsl); - mnt_sl = label2bslabel(mnt_tsl); - - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { - /* - * The dataset doesn't have a real label, so fabricate one. - */ - char *str = NULL; - - if (l_to_str_internal(mnt_sl, &str) == 0 && - dsl_prop_set_string(osname, - zfs_prop_to_name(ZFS_PROP_MLSLABEL), - ZPROP_SRC_LOCAL, str) == 0) - retv = 0; - if (str != NULL) - kmem_free(str, strlen(str) + 1); - } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { - /* - * Now compare labels to complete the MAC check. If the - * labels are equal then allow access. If the mountpoint - * label dominates the dataset label, allow readonly access. - * Otherwise, access is denied. - */ - if (blequal(mnt_sl, &ds_sl)) - retv = 0; - else if (bldominates(mnt_sl, &ds_sl)) { - vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); - retv = 0; - } - } - - label_rele(mnt_tsl); - zone_rele(mntzone); - return (retv); -} -#endif /* SECLABEL */ - -static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(poolname, osname, p - osname); poolname[p - osname] = '\0'; } return (0); } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK1(mvp); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK1(mvp); goto out; } VOP_UNLOCK1(mvp); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curproc) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = SET_ERROR(EPERM); goto out; } - -#ifdef SECLABEL - error = zfs_mount_label_policy(vfsp, osname); - if (error) - goto out; -#endif vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname, false); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ strlcpy(statp->f_fstypename, "zfs", sizeof (statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof (statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof (statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; dsl_dir_t *dd; /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but zreles run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * * We can safely read z_nr_znodes without locking because the * VFS has already blocked operations which add to the * z_all_znodes list and thus increment z_nr_znodes. */ int round = 0; while (zfsvfs->z_nr_znodes > 0) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ #ifdef FREEBSD_NAMECACHE cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (!zfs_is_readonly(zfsvfs)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)vfsp->vfs_resource, ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) return (ret); while (taskqueue_cancel(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task, NULL) != 0) taskqueue_drain(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (err == 0) { err = vn_lock(*vpp, flags); if (err != 0) vrele(*vpp); } if (err != 0) *vpp = NULL; return (err); } static int #if __FreeBSD_version >= 1300098 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors) #else zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } CTASSERT(SHORT_FID_LEN <= sizeof (struct fid)); CTASSERT(LONG_FID_LEN <= sizeof (struct fid)); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { struct componentname cn; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; vnode_t *dvp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { ZFS_EXIT(zfsvfs); VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | LOCKLEAF; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else if (object == zfsvfs->z_shares_dir) { /* * XXX This branch must not be taken, * if it is, then the lookup below will * explode. */ cn.cn_nameptr = "shares"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else { *vpp = dvp; } return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { ZFS_EXIT(zfsvfs); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; znode_t *zp; ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via ZFS_ENTER_VERIFY_VP * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #include #include #include #include #include #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof (struct vm_object) + sizeof (struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); } void zfs_fini(void) { taskq_destroy(zfsvfs_taskq); zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_vfs, 0); zfsvfs->z_unmounted = B_TRUE; return (0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %ju to %ju", (uintmax_t)zfsvfs->z_version, (uintmax_t)newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; } else { pname = zfs_prop_to_name(prop); } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. */ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_vfs != NULL && (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void) strlcpy(fromname, newname, sizeof (mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", newname, fromname + oldlen); (void) strlcpy(fromname, tmpbuf, sizeof (mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif Index: vendor-sys/openzfs/dist/module/os/freebsd/zfs/zvol_os.c =================================================================== --- vendor-sys/openzfs/dist/module/os/freebsd/zfs/zvol_os.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/freebsd/zfs/zvol_os.c (revision 365340) @@ -1,1454 +1,1474 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2006-2010 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright 2010 Robert Milkowski * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2011 Martin Matuska */ /* * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev/zvol// * * Volumes are persistent through reboot. No user command needs to be * run before opening and using a device. * * On FreeBSD ZVOLs are simply GEOM providers like any other storage device * in the system. Except when they're simply character devices (volmode=dev). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #define ZVOL_DUMPSIZE "dumpsize" #ifdef ZVOL_LOCK_DEBUG #define ZVOL_RW_READER RW_WRITER #define ZVOL_RW_READ_HELD RW_WRITE_HELD #else #define ZVOL_RW_READER RW_READER #define ZVOL_RW_READ_HELD RW_READ_HELD #endif enum zvol_geom_state { ZVOL_GEOM_UNINIT, ZVOL_GEOM_STOPPED, ZVOL_GEOM_RUNNING, }; struct zvol_state_os { int zso_volmode; #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom union { /* volmode=dev */ struct zvol_state_dev { struct cdev *zsd_cdev; uint64_t zsd_sync_cnt; } _zso_dev; /* volmode=geom */ struct zvol_state_geom { struct g_provider *zsg_provider; struct bio_queue_head zsg_queue; struct mtx zsg_queue_mtx; enum zvol_geom_state zsg_state; } _zso_geom; } _zso_state; }; static uint32_t zvol_minors; SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, "Expose as GEOM providers (1), device files (2) or neither"); static boolean_t zpool_on_zvol = B_FALSE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, "Allow zpools to use zvols as vdevs (DANGEROUS)"); /* * Toggle unmap functionality. */ boolean_t zvol_unmap_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); /* * zvol maximum transfer in one DMU tx. */ int zvol_maxphys = DMU_MAX_ACCESS / 2; static void zvol_ensure_zilog(zvol_state_t *zv); static d_open_t zvol_cdev_open; static d_close_t zvol_cdev_close; static d_ioctl_t zvol_cdev_ioctl; static d_read_t zvol_cdev_read; static d_write_t zvol_cdev_write; static d_strategy_t zvol_geom_bio_strategy; static struct cdevsw zvol_cdevsw = { .d_name = "zvol", .d_version = D_VERSION, .d_flags = D_DISK | D_TRACKCLOSE, .d_open = zvol_cdev_open, .d_close = zvol_cdev_close, .d_ioctl = zvol_cdev_ioctl, .d_read = zvol_cdev_read, .d_write = zvol_cdev_write, .d_strategy = zvol_geom_bio_strategy, }; extern uint_t zfs_geom_probe_vdev_key; struct g_class zfs_zvol_class = { .name = "ZFS::ZVOL", .version = G_VERSION, }; DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); static void zvol_geom_run(zvol_state_t *zv); static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); static void zvol_geom_worker(void *arg); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); /* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ /* * GEOM mode implementation */ /*ARGSUSED*/ static int zvol_geom_open(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; int err = 0; boolean_t drop_suspend = B_TRUE; + boolean_t drop_namespace = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { /* * if zfs_geom_probe_vdev_key is set, that means that zfs is * attempting to probe geom providers while looking for a * replacement for a missing VDEV. In this case, the * spa_namespace_lock will not be held, but it is still illegal * to use a zvol as a vdev. Deadlocks can result if another * thread has spa_namespace_lock */ return (SET_ERROR(EOPNOTSUPP)); } +retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; if (zv == NULL) { + if (drop_namespace) + mutex_exit(&spa_namespace_lock); rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } + if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + /* + * We need to guarantee that the namespace lock is held + * to avoid spurious failures in zvol_first_open + */ + drop_namespace = B_TRUE; + if (!mutex_tryenter(&spa_namespace_lock)) { + rw_exit(&zvol_state_lock); + mutex_enter(&spa_namespace_lock); + goto retry; + } + } mutex_enter(&zv->zv_state_lock); ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); /* * make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flag & FWRITE)); if (err) goto out_mutex; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; } /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || dmu_objset_incompatible_encryption_version(zv->zv_objset))) { err = EROFS; goto out_open_count; } if (zv->zv_flags & ZVOL_EXCL) { err = EBUSY; goto out_open_count; } #ifdef FEXCL if (flag & FEXCL) { if (zv->zv_open_count != 0) { err = EBUSY; goto out_open_count; } zv->zv_flags |= ZVOL_EXCL; } #endif zv->zv_open_count += count; + if (drop_namespace) + mutex_exit(&spa_namespace_lock); mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); out_mutex: + if (drop_namespace) + mutex_exit(&spa_namespace_lock); mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(err)); } /*ARGSUSED*/ static int zvol_geom_close(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { ASSERT(zv->zv_open_count == 1); zv->zv_flags &= ~ZVOL_EXCL; } ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ ASSERT(zv->zv_open_count > 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if ((zv->zv_open_count - count) == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * You may get multiple opens, but only one close. */ zv->zv_open_count -= count; if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); } static void zvol_geom_run(zvol_state_t *zv) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); g_error_provider(pp, 0); kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); } static void zvol_geom_destroy(zvol_state_t *zv) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); g_topology_assert(); mutex_enter(&zv->zv_state_lock); VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING); mutex_exit(&zv->zv_state_lock); zsg->zsg_provider = NULL; pp->private = NULL; g_wither_geom(pp->geom, ENXIO); } static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) { int count, error, flags; g_topology_assert(); /* * To make it easier we expect either open or close, but not both * at the same time. */ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || (acr <= 0 && acw <= 0 && ace <= 0), ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); if (pp->private == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); } /* * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if * ace != 0, because GEOM already handles that and handles it a bit * differently. GEOM allows for multiple read/exclusive consumers and * ZFS allows only one exclusive consumer, no matter if it is reader or * writer. I like better the way GEOM works so I'll leave it for GEOM * to decide what to do. */ count = acr + acw + ace; if (count == 0) return (0); flags = 0; if (acr != 0 || ace != 0) flags |= FREAD; if (acw != 0) flags |= FWRITE; g_topology_unlock(); if (count > 0) error = zvol_geom_open(pp, flags, count); else error = zvol_geom_close(pp, flags, -count); g_topology_lock(); return (error); } static void zvol_geom_worker(void *arg) { zvol_state_t *zv = arg; struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct bio *bp; ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); for (;;) { mtx_lock(&zsg->zsg_queue_mtx); bp = bioq_takefirst(&zsg->zsg_queue); if (bp == NULL) { if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { zsg->zsg_state = ZVOL_GEOM_RUNNING; wakeup(&zsg->zsg_state); mtx_unlock(&zsg->zsg_queue_mtx); kthread_exit(); } msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, PRIBIO | PDROP, "zvol:io", 0); continue; } mtx_unlock(&zsg->zsg_queue_mtx); zvol_geom_bio_strategy(bp); } } static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; boolean_t first; if (bp->bio_cmd == BIO_GETATTR) { if (zvol_geom_bio_getattr(bp)) g_io_deliver(bp, EOPNOTSUPP); return; } if (!THREAD_CAN_SLEEP()) { mtx_lock(&zsg->zsg_queue_mtx); first = (bioq_first(&zsg->zsg_queue) == NULL); bioq_insert_tail(&zsg->zsg_queue, bp); mtx_unlock(&zsg->zsg_queue_mtx); if (first) wakeup_one(&zsg->zsg_queue); return; } zvol_geom_bio_strategy(bp); } static int zvol_geom_bio_getattr(struct bio *bp) { zvol_state_t *zv; zv = bp->bio_to->private; ASSERT(zv != NULL); spa_t *spa = dmu_objset_spa(zv->zv_objset); uint64_t refd, avail, usedobjs, availobjs; if (g_handleattr_int(bp, "GEOM::candelete", 1)) return (0); if (strcmp(bp->bio_attribute, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksavail", avail / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) return (0); } return (1); } static void zvol_geom_bio_strategy(struct bio *bp) { zvol_state_t *zv; uint64_t off, volsize; size_t resid; char *addr; objset_t *os; zfs_locked_range_t *lr; int error = 0; boolean_t doread = B_FALSE; boolean_t is_dumpified; boolean_t sync; if (bp->bio_to) zv = bp->bio_to->private; else zv = bp->bio_dev->si_drv2; if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); switch (bp->bio_cmd) { case BIO_READ: doread = B_TRUE; break; case BIO_WRITE: case BIO_FLUSH: case BIO_DELETE: if (zv->zv_flags & ZVOL_RDONLY) { error = SET_ERROR(EROFS); goto resume; } zvol_ensure_zilog(zv); if (bp->bio_cmd == BIO_FLUSH) goto sync; break; default: error = EOPNOTSUPP; goto resume; } off = bp->bio_offset; volsize = zv->zv_volsize; os = zv->zv_objset; ASSERT(os != NULL); addr = bp->bio_data; resid = bp->bio_length; - if (resid > 0 && (off < 0 || off >= volsize)) { + if (resid > 0 && off >= volsize) { error = SET_ERROR(EIO); goto resume; } is_dumpified = B_FALSE; sync = !doread && !is_dumpified && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. */ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, doread ? RL_READER : RL_WRITER); if (bp->bio_cmd == BIO_DELETE) { dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, off, resid, sync); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, off, resid); resid = 0; } goto unlock; } while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { error = dmu_read(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } off += size; addr += size; resid -= size; } unlock: zfs_rangelock_exit(lr); bp->bio_completed = bp->bio_length - resid; if (bp->bio_completed < bp->bio_length && off > volsize) error = EINVAL; switch (bp->bio_cmd) { case BIO_FLUSH: break; case BIO_READ: dataset_kstats_update_read_kstats(&zv->zv_kstat, bp->bio_completed); break; case BIO_WRITE: dataset_kstats_update_write_kstats(&zv->zv_kstat, bp->bio_completed); break; case BIO_DELETE: break; default: break; } if (sync) { sync: zil_commit(zv->zv_zilog, ZVOL_OBJ); } resume: rw_exit(&zv->zv_suspend_lock); out: if (bp->bio_to) g_io_deliver(bp, error); else biofinish(bp, NULL, error); } /* * Character device mode implementation */ static int zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; zv = dev->si_drv2; volsize = zv->zv_volsize; /* * uio_loffset == volsize isn't an error as * its required for EOF processing. */ if (uio->uio_resid > 0 && (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) return (SET_ERROR(EIO)); lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); return (error); } static int zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; boolean_t sync; zv = dev->si_drv2; volsize = zv->zv_volsize; if (uio->uio_resid > 0 && (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) return (SET_ERROR(EIO)); sync = (ioflag & IO_SYNC) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); zvol_ensure_zilog(zv); lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio->uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); return (error); } static int zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; struct zvol_state_dev *zsd; int err = 0; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); /* * make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flags & FWRITE)); if (err) goto out_locked; } if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { err = EROFS; goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { err = EBUSY; goto out_opened; } #ifdef FEXCL if (flags & FEXCL) { if (zv->zv_open_count != 0) { err = EBUSY; goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } #endif zv->zv_open_count++; if (flags & (FSYNC | FDSYNC)) { zsd = &zv->zv_zso->zso_dev; zsd->zsd_sync_cnt++; if (zsd->zsd_sync_cnt == 1) zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); out_opened: if (zv->zv_open_count == 0) zvol_last_close(zv); out_locked: mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(err)); } static int zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; struct zvol_state_dev *zsd; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { ASSERT(zv->zv_open_count == 1); zv->zv_flags &= ~ZVOL_EXCL; } ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ ASSERT(zv->zv_open_count > 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * You may get multiple opens, but only one close. */ zv->zv_open_count--; if (flags & (FSYNC | FDSYNC)) { zsd = &zv->zv_zso->zso_dev; zsd->zsd_sync_cnt--; } if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); } static int zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, int fflag, struct thread *td) { zvol_state_t *zv; zfs_locked_range_t *lr; off_t offset, length; int i, error; boolean_t sync; zv = dev->si_drv2; error = 0; KASSERT(zv->zv_open_count > 0, ("Device with zero access count in %s", __func__)); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(uint32_t *)data = DEV_BSIZE; break; case DIOCGMEDIASIZE: *(off_t *)data = zv->zv_volsize; break; case DIOCGFLUSH: rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); if (zv->zv_zilog != NULL) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGDELETE: if (!zvol_unmap_enabled) break; offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || offset < 0 || offset >= zv->zv_volsize || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = EINVAL; break; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); zvol_ensure_zilog(zv); lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, RL_WRITER); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { sync = FALSE; dmu_tx_abort(tx); } else { sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); zvol_log_truncate(zv, tx, offset, length, sync); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } zfs_rangelock_exit(lr); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGSTRIPESIZE: *(off_t *)data = zv->zv_volblocksize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = 0; break; case DIOCGATTR: { spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; if (strcmp(arg->name, "GEOM::candelete") == 0) arg->value.i = 1; else if (strcmp(arg->name, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = refd / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc( spa_normal_class(spa)); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); arg->value.off = refd / DEV_BSIZE; } else error = ENOIOCTL; break; } case FIOSEEKHOLE: case FIOSEEKDATA: { off_t *off = (off_t *)data; uint64_t noff; boolean_t hole; hole = (cmd == FIOSEEKHOLE); noff = *off; error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); *off = noff; break; } default: error = ENOIOCTL; } return (error); } /* * Misc. helpers */ static void zvol_ensure_zilog(zvol_state_t *zv) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { if (!rw_tryupgrade(&zv->zv_suspend_lock)) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); } if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data); zv->zv_flags |= ZVOL_WRITTEN_TO; } rw_downgrade(&zv->zv_suspend_lock); } } static boolean_t zvol_is_zvol_impl(const char *device) { return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); } static void zvol_rename_minor(zvol_state_t *zv, const char *newname) { ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; struct g_geom *gp; g_topology_lock(); gp = pp->geom; ASSERT(gp != NULL); zsg->zsg_provider = NULL; g_wither_provider(pp, ENXIO); pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = zv->zv_volsize; pp->private = zv; zsg->zsg_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; dev = zsd->zsd_cdev; if (dev != NULL) { destroy_dev(dev); dev = zsd->zsd_cdev = NULL; if (zv->zv_open_count > 0) { zv->zv_flags &= ~ZVOL_EXCL; zv->zv_open_count = 0; /* XXX need suspend lock but lock order */ zvol_last_close(zv); } } make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) == 0) { dev->si_iosize_max = MAXPHYS; zsd->zsd_cdev = dev; } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); } /* * Remove minor node for the specified volume. */ static void zvol_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT(zv->zv_open_count == 0); ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; g_topology_lock(); zvol_geom_destroy(zv); g_topology_unlock(); mtx_destroy(&zsg->zsg_queue_mtx); } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; if (dev != NULL) destroy_dev(dev); } mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); zvol_minors--; } /* * Create a minor node (plus a whole lot more) for the specified volume. */ static int zvol_create_minor_impl(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t volmode, hash; int error; ZFS_LOG(1, "Creating ZVOL %s...", name); hash = zvol_name_hash(name); if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); return (SET_ERROR(EEXIST)); } DROP_GIANT(); /* lie and say we're read-only */ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; /* * zvol_alloc equivalent ... */ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); zv->zv_hash = hash; mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso->zso_volmode = volmode; if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp; struct g_geom *gp; zsg->zsg_state = ZVOL_GEOM_UNINIT; mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); g_topology_lock(); gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); gp->start = zvol_geom_bio_start; gp->access = zvol_geom_access; pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); /* TODO: NULL check? */ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = 0; pp->private = zv; zsg->zsg_provider = pp; bioq_init(&zsg->zsg_queue); } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); if (error != 0) { mutex_destroy(&zv->zv_state_lock); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (*zv)); dmu_objset_disown(os, B_TRUE, FTAG); goto out_giant; } dev->si_iosize_max = MAXPHYS; zsd->zsd_cdev = dev; } (void) strlcpy(zv->zv_name, name, MAXPATHLEN); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) zil_destroy(dmu_objset_zil(os), B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); /* XXX do prefetch */ zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { if (error == 0) zvol_geom_run(zv); g_topology_unlock(); } out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); zvol_minors++; rw_exit(&zvol_state_lock); } ZFS_LOG(1, "ZVOL %s created.", name); out_giant: PICKUP_GIANT(); return (error); } static void zvol_clear_private(zvol_state_t *zv) { ASSERT(RW_LOCK_HELD(&zvol_state_lock)); if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; if (pp == NULL) /* XXX when? */ return; mtx_lock(&zsg->zsg_queue_mtx); zsg->zsg_state = ZVOL_GEOM_STOPPED; pp->private = NULL; wakeup_one(&zsg->zsg_queue); while (zsg->zsg_state != ZVOL_GEOM_RUNNING) msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, 0, "zvol:w", 0); mtx_unlock(&zsg->zsg_queue_mtx); ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); } } static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { zv->zv_volsize = volsize; if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; if (pp == NULL) /* XXX when? */ return (0); g_topology_lock(); /* * Do not invoke resize event when initial size was zero. * ZVOL initializes the size on first open, this is not * real resizing. */ if (pp->mediasize == 0) pp->mediasize = zv->zv_volsize; else g_resize_provider(pp, zv->zv_volsize); g_topology_unlock(); } return (0); } static void zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) { // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); } static void zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) { // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); } const static zvol_platform_ops_t zvol_freebsd_ops = { .zv_free = zvol_free, .zv_rename_minor = zvol_rename_minor, .zv_create_minor = zvol_create_minor_impl, .zv_update_volsize = zvol_update_volsize, .zv_clear_private = zvol_clear_private, .zv_is_zvol = zvol_is_zvol_impl, .zv_set_disk_ro = zvol_set_disk_ro_impl, .zv_set_capacity = zvol_set_capacity_impl, }; /* * Public interfaces */ int zvol_busy(void) { return (zvol_minors != 0); } int zvol_init(void) { zvol_init_impl(); zvol_register_ops(&zvol_freebsd_ops); return (0); } void zvol_fini(void) { zvol_fini_impl(); } Index: vendor-sys/openzfs/dist/module/os/linux/spl/spl-kstat.c =================================================================== --- vendor-sys/openzfs/dist/module/os/linux/spl/spl-kstat.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/linux/spl/spl-kstat.c (revision 365340) @@ -1,778 +1,782 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Kstat Implementation. + * + * Links to Illumos.org for more information on kstat function: + * [1] https://illumos.org/man/1M/kstat + * [2] https://illumos.org/man/9f/kstat_create */ #include #include #include #include #include static kmutex_t kstat_module_lock; static struct list_head kstat_module_list; static kid_t kstat_id; static int kstat_resize_raw(kstat_t *ksp) { if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) return (ENOMEM); vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); return (0); } void kstat_waitq_enter(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t wcnt; new = gethrtime(); delta = new - kiop->wlastupdate; kiop->wlastupdate = new; wcnt = kiop->wcnt++; if (wcnt != 0) { kiop->wlentime += delta * wcnt; kiop->wtime += delta; } } EXPORT_SYMBOL(kstat_waitq_enter); void kstat_waitq_exit(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t wcnt; new = gethrtime(); delta = new - kiop->wlastupdate; kiop->wlastupdate = new; wcnt = kiop->wcnt--; ASSERT((int)wcnt > 0); kiop->wlentime += delta * wcnt; kiop->wtime += delta; } EXPORT_SYMBOL(kstat_waitq_exit); void kstat_runq_enter(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t rcnt; new = gethrtime(); delta = new - kiop->rlastupdate; kiop->rlastupdate = new; rcnt = kiop->rcnt++; if (rcnt != 0) { kiop->rlentime += delta * rcnt; kiop->rtime += delta; } } EXPORT_SYMBOL(kstat_runq_enter); void kstat_runq_exit(kstat_io_t *kiop) { hrtime_t new, delta; ulong_t rcnt; new = gethrtime(); delta = new - kiop->rlastupdate; kiop->rlastupdate = new; rcnt = kiop->rcnt--; ASSERT((int)rcnt > 0); kiop->rlentime += delta * rcnt; kiop->rtime += delta; } EXPORT_SYMBOL(kstat_runq_exit); static int kstat_seq_show_headers(struct seq_file *f) { kstat_t *ksp = (kstat_t *)f->private; int rc = 0; ASSERT(ksp->ks_magic == KS_MAGIC); seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", ksp->ks_kid, ksp->ks_type, ksp->ks_flags, ksp->ks_ndata, (int)ksp->ks_data_size, ksp->ks_crtime, ksp->ks_snaptime); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: restart: if (ksp->ks_raw_ops.headers) { rc = ksp->ks_raw_ops.headers( ksp->ks_raw_buf, ksp->ks_raw_bufsize); if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (!rc) seq_puts(f, ksp->ks_raw_buf); } else { seq_printf(f, "raw data\n"); } break; case KSTAT_TYPE_NAMED: seq_printf(f, "%-31s %-4s %s\n", "name", "type", "data"); break; case KSTAT_TYPE_INTR: seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", "hard", "soft", "watchdog", "spurious", "multsvc"); break; case KSTAT_TYPE_IO: seq_printf(f, "%-8s %-8s %-8s %-8s %-8s %-8s " "%-8s %-8s %-8s %-8s %-8s %-8s\n", "nread", "nwritten", "reads", "writes", "wtime", "wlentime", "wupdate", "rtime", "rlentime", "rupdate", "wcnt", "rcnt"); break; case KSTAT_TYPE_TIMER: seq_printf(f, "%-31s %-8s " "%-8s %-8s %-8s %-8s %-8s\n", "name", "events", "elapsed", "min", "max", "start", "stop"); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (-rc); } static int kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) { int i, j; for (i = 0; ; i++) { seq_printf(f, "%03x:", i); for (j = 0; j < 16; j++) { if (i * 16 + j >= l) { seq_printf(f, "\n"); goto out; } seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]); } seq_printf(f, "\n"); } out: return (0); } static int kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) { seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); switch (knp->data_type) { case KSTAT_DATA_CHAR: knp->value.c[15] = '\0'; /* NULL terminate */ seq_printf(f, "%-16s", knp->value.c); break; /* * NOTE - We need to be more careful able what tokens are * used for each arch, for now this is correct for x86_64. */ case KSTAT_DATA_INT32: seq_printf(f, "%d", knp->value.i32); break; case KSTAT_DATA_UINT32: seq_printf(f, "%u", knp->value.ui32); break; case KSTAT_DATA_INT64: seq_printf(f, "%lld", (signed long long)knp->value.i64); break; case KSTAT_DATA_UINT64: seq_printf(f, "%llu", (unsigned long long)knp->value.ui64); break; case KSTAT_DATA_LONG: seq_printf(f, "%ld", knp->value.l); break; case KSTAT_DATA_ULONG: seq_printf(f, "%lu", knp->value.ul); break; case KSTAT_DATA_STRING: KSTAT_NAMED_STR_PTR(knp) [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); break; default: PANIC("Undefined kstat data type %d\n", knp->data_type); } seq_printf(f, "\n"); return (0); } static int kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) { seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", kip->intrs[KSTAT_INTR_HARD], kip->intrs[KSTAT_INTR_SOFT], kip->intrs[KSTAT_INTR_WATCHDOG], kip->intrs[KSTAT_INTR_SPURIOUS], kip->intrs[KSTAT_INTR_MULTSVC]); return (0); } static int kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) { /* though wlentime & friends are signed, they will never be negative */ seq_printf(f, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, kip->reads, kip->writes, kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); return (0); } static int kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) { seq_printf(f, "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n", ktp->name, ktp->num_events, ktp->elapsed_time, ktp->min_time, ktp->max_time, ktp->start_time, ktp->stop_time); return (0); } static int kstat_seq_show(struct seq_file *f, void *p) { kstat_t *ksp = (kstat_t *)f->private; int rc = 0; ASSERT(ksp->ks_magic == KS_MAGIC); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: restart: if (ksp->ks_raw_ops.data) { rc = ksp->ks_raw_ops.data( ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (!rc) seq_puts(f, ksp->ks_raw_buf); } else { ASSERT(ksp->ks_ndata == 1); rc = kstat_seq_show_raw(f, ksp->ks_data, ksp->ks_data_size); } break; case KSTAT_TYPE_NAMED: rc = kstat_seq_show_named(f, (kstat_named_t *)p); break; case KSTAT_TYPE_INTR: rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); break; case KSTAT_TYPE_IO: rc = kstat_seq_show_io(f, (kstat_io_t *)p); break; case KSTAT_TYPE_TIMER: rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (-rc); } static int kstat_default_update(kstat_t *ksp, int rw) { ASSERT(ksp != NULL); if (rw == KSTAT_WRITE) return (EACCES); return (0); } static void * kstat_seq_data_addr(kstat_t *ksp, loff_t n) { void *rc = NULL; switch (ksp->ks_type) { case KSTAT_TYPE_RAW: if (ksp->ks_raw_ops.addr) rc = ksp->ks_raw_ops.addr(ksp, n); else rc = ksp->ks_data; break; case KSTAT_TYPE_NAMED: rc = ksp->ks_data + n * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: rc = ksp->ks_data + n * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: rc = ksp->ks_data + n * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: rc = ksp->ks_data + n * sizeof (kstat_timer_t); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (rc); } static void * kstat_seq_start(struct seq_file *f, loff_t *pos) { loff_t n = *pos; kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); mutex_enter(ksp->ks_lock); if (ksp->ks_type == KSTAT_TYPE_RAW) { ksp->ks_raw_bufsize = PAGE_SIZE; ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); } /* Dynamically update kstat, on error existing kstats are used */ (void) ksp->ks_update(ksp, KSTAT_READ); ksp->ks_snaptime = gethrtime(); if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n && kstat_seq_show_headers(f)) return (NULL); if (n >= ksp->ks_ndata) return (NULL); return (kstat_seq_data_addr(ksp, n)); } static void * kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) { kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); ++*pos; if (*pos >= ksp->ks_ndata) return (NULL); return (kstat_seq_data_addr(ksp, *pos)); } static void kstat_seq_stop(struct seq_file *f, void *v) { kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); if (ksp->ks_type == KSTAT_TYPE_RAW) vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); mutex_exit(ksp->ks_lock); } static struct seq_operations kstat_seq_ops = { .show = kstat_seq_show, .start = kstat_seq_start, .next = kstat_seq_next, .stop = kstat_seq_stop, }; static kstat_module_t * kstat_find_module(char *name) { kstat_module_t *module = NULL; list_for_each_entry(module, &kstat_module_list, ksm_module_list) { if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) return (module); } return (NULL); } static kstat_module_t * kstat_create_module(char *name) { kstat_module_t *module; struct proc_dir_entry *pde; pde = proc_mkdir(name, proc_spl_kstat); if (pde == NULL) return (NULL); module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); module->ksm_proc = pde; strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); INIT_LIST_HEAD(&module->ksm_kstat_list); list_add_tail(&module->ksm_module_list, &kstat_module_list); return (module); } static void kstat_delete_module(kstat_module_t *module) { ASSERT(list_empty(&module->ksm_kstat_list)); remove_proc_entry(module->ksm_name, proc_spl_kstat); list_del(&module->ksm_module_list); kmem_free(module, sizeof (kstat_module_t)); } static int proc_kstat_open(struct inode *inode, struct file *filp) { struct seq_file *f; int rc; rc = seq_open(filp, &kstat_seq_ops); if (rc) return (rc); f = filp->private_data; f->private = PDE_DATA(inode); return (rc); } static ssize_t proc_kstat_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct seq_file *f = filp->private_data; kstat_t *ksp = f->private; int rc; ASSERT(ksp->ks_magic == KS_MAGIC); mutex_enter(ksp->ks_lock); rc = ksp->ks_update(ksp, KSTAT_WRITE); mutex_exit(ksp->ks_lock); if (rc) return (-rc); *ppos += len; return (len); } static const kstat_proc_op_t proc_kstat_operations = { #ifdef HAVE_PROC_OPS_STRUCT .proc_open = proc_kstat_open, .proc_write = proc_kstat_write, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, #else .open = proc_kstat_open, .write = proc_kstat_write, .read = seq_read, .llseek = seq_lseek, .release = seq_release, #endif }; void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } EXPORT_SYMBOL(__kstat_set_raw_ops); void kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, const char *name) { kpep->kpe_owner = NULL; kpep->kpe_proc = NULL; INIT_LIST_HEAD(&kpep->kpe_list); strncpy(kpep->kpe_module, module, KSTAT_STRLEN); strncpy(kpep->kpe_name, name, KSTAT_STRLEN); } EXPORT_SYMBOL(kstat_proc_entry_init); kstat_t * __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags) { kstat_t *ksp; ASSERT(ks_module); ASSERT(ks_instance == 0); ASSERT(ks_name); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) ASSERT(ks_ndata == 1); ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); if (ksp == NULL) return (ksp); mutex_enter(&kstat_module_lock); ksp->ks_kid = kstat_id; kstat_id++; mutex_exit(&kstat_module_lock); ksp->ks_magic = KS_MAGIC; mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); ksp->ks_lock = &ksp->ks_private_lock; ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; ksp->ks_instance = ks_instance; strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); ksp->ks_type = ks_type; ksp->ks_flags = ks_flags; ksp->ks_update = kstat_default_update; ksp->ks_private = NULL; ksp->ks_raw_ops.headers = NULL; ksp->ks_raw_ops.data = NULL; ksp->ks_raw_ops.addr = NULL; ksp->ks_raw_buf = NULL; ksp->ks_raw_bufsize = 0; kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: ksp->ks_ndata = 1; ksp->ks_data_size = ks_ndata; break; case KSTAT_TYPE_NAMED: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { ksp->ks_data = NULL; } else { ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); if (ksp->ks_data == NULL) { kmem_free(ksp, sizeof (*ksp)); ksp = NULL; } } return (ksp); } EXPORT_SYMBOL(__kstat_create); static int kstat_detect_collision(kstat_proc_entry_t *kpep) { kstat_module_t *module; kstat_proc_entry_t *tmp = NULL; char *parent; char *cp; parent = kmem_asprintf("%s", kpep->kpe_module); if ((cp = strrchr(parent, '/')) == NULL) { kmem_strfree(parent); return (0); } cp[0] = '\0'; if ((module = kstat_find_module(parent)) != NULL) { list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { kmem_strfree(parent); return (EEXIST); } } } kmem_strfree(parent); return (0); } /* * Add a file to the proc filesystem under the kstat namespace (i.e. * /proc/spl/kstat/). The file need not necessarily be implemented as a * kstat. */ void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, const kstat_proc_op_t *proc_ops, void *data) { kstat_module_t *module; kstat_proc_entry_t *tmp = NULL; ASSERT(kpep); mutex_enter(&kstat_module_lock); module = kstat_find_module(kpep->kpe_module); if (module == NULL) { if (kstat_detect_collision(kpep) != 0) { cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ " collision", kpep->kpe_module, kpep->kpe_name); goto out; } module = kstat_create_module(kpep->kpe_module); if (module == NULL) goto out; } /* * Only one entry by this name per-module, on failure the module * shouldn't be deleted because we know it has at least one entry. */ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) goto out; } list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); kpep->kpe_owner = module; kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode, module->ksm_proc, proc_ops, data); if (kpep->kpe_proc == NULL) { list_del_init(&kpep->kpe_list); if (list_empty(&module->ksm_kstat_list)) kstat_delete_module(module); } out: mutex_exit(&kstat_module_lock); } EXPORT_SYMBOL(kstat_proc_entry_install); void __kstat_install(kstat_t *ksp) { ASSERT(ksp); mode_t mode; /* Specify permission modes for different kstats */ if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) { mode = 0600; } else { mode = 0644; } kstat_proc_entry_install( &ksp->ks_proc, mode, &proc_kstat_operations, ksp); } EXPORT_SYMBOL(__kstat_install); void kstat_proc_entry_delete(kstat_proc_entry_t *kpep) { kstat_module_t *module = kpep->kpe_owner; if (kpep->kpe_proc) remove_proc_entry(kpep->kpe_name, module->ksm_proc); mutex_enter(&kstat_module_lock); list_del_init(&kpep->kpe_list); /* * Remove top level module directory if it wasn't empty before, but now * is. */ if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list)) kstat_delete_module(module); mutex_exit(&kstat_module_lock); } EXPORT_SYMBOL(kstat_proc_entry_delete); void __kstat_delete(kstat_t *ksp) { kstat_proc_entry_delete(&ksp->ks_proc); if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) kmem_free(ksp->ks_data, ksp->ks_data_size); ksp->ks_lock = NULL; mutex_destroy(&ksp->ks_private_lock); kmem_free(ksp, sizeof (*ksp)); } EXPORT_SYMBOL(__kstat_delete); int spl_kstat_init(void) { mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL); INIT_LIST_HEAD(&kstat_module_list); kstat_id = 0; return (0); } void spl_kstat_fini(void) { ASSERT(list_empty(&kstat_module_list)); mutex_destroy(&kstat_module_lock); } Index: vendor-sys/openzfs/dist/module/os/linux/zfs/Makefile.in =================================================================== --- vendor-sys/openzfs/dist/module/os/linux/zfs/Makefile.in (revision 365339) +++ vendor-sys/openzfs/dist/module/os/linux/zfs/Makefile.in (revision 365340) @@ -1,37 +1,36 @@ # # Linux specific sources included from module/zfs/Makefile.in # # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value $(MODULE)-objs += ../os/linux/zfs/abd_os.o $(MODULE)-objs += ../os/linux/zfs/arc_os.o $(MODULE)-objs += ../os/linux/zfs/mmp_os.o $(MODULE)-objs += ../os/linux/zfs/policy.o $(MODULE)-objs += ../os/linux/zfs/trace.o $(MODULE)-objs += ../os/linux/zfs/qat.o $(MODULE)-objs += ../os/linux/zfs/qat_compress.o $(MODULE)-objs += ../os/linux/zfs/qat_crypt.o $(MODULE)-objs += ../os/linux/zfs/spa_misc_os.o -$(MODULE)-objs += ../os/linux/zfs/spa_stats.o $(MODULE)-objs += ../os/linux/zfs/vdev_disk.o $(MODULE)-objs += ../os/linux/zfs/vdev_file.o $(MODULE)-objs += ../os/linux/zfs/zfs_acl.o $(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o $(MODULE)-objs += ../os/linux/zfs/zfs_debug.o $(MODULE)-objs += ../os/linux/zfs/zfs_dir.o $(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o $(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o $(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o $(MODULE)-objs += ../os/linux/zfs/zfs_znode.o $(MODULE)-objs += ../os/linux/zfs/zio_crypt.o $(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o $(MODULE)-objs += ../os/linux/zfs/zpl_export.o $(MODULE)-objs += ../os/linux/zfs/zpl_file.o $(MODULE)-objs += ../os/linux/zfs/zpl_inode.o $(MODULE)-objs += ../os/linux/zfs/zpl_super.o $(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o $(MODULE)-objs += ../os/linux/zfs/zvol_os.o Index: vendor-sys/openzfs/dist/module/os/linux/zfs/arc_os.c =================================================================== --- vendor-sys/openzfs/dist/module/os/linux/zfs/arc_os.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/linux/zfs/arc_os.c (revision 365340) @@ -1,465 +1,464 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #endif #include #include #include #include #include #include #include /* * This is a limit on how many pages the ARC shrinker makes available for * eviction in response to one page allocation attempt. Note that in * practice, the kernel's shrinker can ask us to evict up to about 4x this * for one allocation attempt. * * The default limit of 10,000 (in practice, 160MB per allocation attempt * with 4K pages) limits the amount of time spent attempting to reclaim ARC * memory to less than 100ms per allocation attempt, even with a small * average compressed block size of ~8KB. * * See also the comment in arc_shrinker_count(). * Set to 0 to disable limit. */ int zfs_arc_shrinker_limit = 10000; /* * Return a default max arc size based on the amount of physical memory. */ uint64_t arc_default_max(uint64_t min, uint64_t allmem) { /* Default to 1/2 of all memory. */ return (MAX(allmem / 2, min)); } #ifdef _KERNEL /* * Return maximum amount of memory that we could possibly use. Reduced * to half of all memory in user space which is primarily used for testing. */ uint64_t arc_all_memory(void) { #ifdef CONFIG_HIGHMEM return (ptob(zfs_totalram_pages - zfs_totalhigh_pages)); #else return (ptob(zfs_totalram_pages)); #endif /* CONFIG_HIGHMEM */ } /* * Return the amount of memory that is considered free. In user space * which is primarily used for testing we pretend that free memory ranges * from 0-20% of all memory. */ uint64_t arc_free_memory(void) { #ifdef CONFIG_HIGHMEM struct sysinfo si; si_meminfo(&si); return (ptob(si.freeram - si.freehigh)); #else return (ptob(nr_free_pages() + - nr_inactive_file_pages() + - nr_slab_reclaimable_pages())); + nr_inactive_file_pages())); #endif /* CONFIG_HIGHMEM */ } /* * Return the amount of memory that can be consumed before reclaim will be * needed. Positive if there is sufficient free memory, negative indicates * the amount of memory that needs to be freed up. */ int64_t arc_available_memory(void) { return (arc_free_memory() - arc_sys_free); } static uint64_t arc_evictable_memory(void) { int64_t asize = aggsum_value(&arc_size); uint64_t arc_clean = zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0); /* * Scale reported evictable memory in proportion to page cache, cap * at specified min/max. */ uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent; min = MAX(arc_c_min, MIN(arc_c_max, min)); if (arc_dirty >= min) return (arc_clean); return (MAX((int64_t)asize - (int64_t)min, 0)); } /* * The _count() function returns the number of free-able objects. * The _scan() function returns the number of objects that were freed. */ static unsigned long arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { /* * __GFP_FS won't be set if we are called from ZFS code (see * kmem_flags_convert(), which removes it). To avoid a deadlock, we * don't allow evicting in this case. We return 0 rather than * SHRINK_STOP so that the shrinker logic doesn't accumulate a * deficit against us. */ if (!(sc->gfp_mask & __GFP_FS)) { return (0); } /* * This code is reached in the "direct reclaim" case, where the * kernel (outside ZFS) is trying to allocate a page, and the system * is low on memory. * * The kernel's shrinker code doesn't understand how many pages the * ARC's callback actually frees, so it may ask the ARC to shrink a * lot for one page allocation. This is problematic because it may * take a long time, thus delaying the page allocation, and because * it may force the ARC to unnecessarily shrink very small. * * Therefore, we limit the amount of data that we say is evictable, * which limits the amount that the shrinker will ask us to evict for * one page allocation attempt. * * In practice, we may be asked to shrink 4x the limit to satisfy one * page allocation, before the kernel's shrinker code gives up on us. * When that happens, we rely on the kernel code to find the pages * that we freed before invoking the OOM killer. This happens in * __alloc_pages_slowpath(), which retries and finds the pages we * freed when it calls get_page_from_freelist(). * * See also the comment above zfs_arc_shrinker_limit. */ int64_t limit = zfs_arc_shrinker_limit != 0 ? zfs_arc_shrinker_limit : INT64_MAX; return (MIN(limit, btop((int64_t)arc_evictable_memory()))); } static unsigned long arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { ASSERT((sc->gfp_mask & __GFP_FS) != 0); /* The arc is considered warm once reclaim has occurred */ if (unlikely(arc_warm == B_FALSE)) arc_warm = B_TRUE; /* * Evict the requested number of pages by reducing arc_c and waiting * for the requested amount of data to be evicted. */ arc_reduce_target_size(ptob(sc->nr_to_scan)); arc_wait_for_eviction(ptob(sc->nr_to_scan)); if (current->reclaim_state != NULL) current->reclaim_state->reclaimed_slab += sc->nr_to_scan; /* * We are experiencing memory pressure which the arc_evict_zthr was * unable to keep up with. Set arc_no_grow to briefly pause arc * growth to avoid compounding the memory pressure. */ arc_no_grow = B_TRUE; /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd * threads were unable to asynchronously keep enough free memory * available. */ if (current_is_kswapd()) { ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { ARCSTAT_BUMP(arcstat_memory_direct_count); } return (sc->nr_to_scan); } SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { uint64_t free_memory = arc_free_memory(); if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100) return (0); if (txg > spa->spa_lowmem_last_txg) { spa->spa_lowmem_last_txg = txg; spa->spa_lowmem_page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (current_is_kswapd()) { if (spa->spa_lowmem_page_load > MAX(arc_sys_free / 4, free_memory) / 4) { DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (SET_ERROR(ERESTART)); } /* Note: reserve is inflated, so we deflate */ atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); return (0); } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (SET_ERROR(EAGAIN)); } spa->spa_lowmem_page_load = 0; return (0); } void arc_lowmem_init(void) { uint64_t allmem = arc_all_memory(); /* * Register a shrinker to support synchronous (direct) memory * reclaim from the arc. This is done to prevent kswapd from * swapping out pages when it is preferable to shrink the arc. */ spl_register_shrinker(&arc_shrinker); /* * The ARC tries to keep at least this much memory available for the * system. This gives the ARC time to shrink in response to memory * pressure, before running completely out of memory and invoking the * direct-reclaim ARC shrinker. * * This should be more than twice high_wmark_pages(), so that * arc_wait_for_eviction() will wait until at least the * high_wmark_pages() are free (see arc_evict_state_impl()). * * Note: Even when the system is very low on memory, the kernel's * shrinker code may only ask for one "batch" of pages (512KB) to be * evicted. If concurrent allocations consume these pages, there may * still be insufficient free pages, and the OOM killer takes action. * * By setting arc_sys_free large enough, and having * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 * free memory, it is much less likely that concurrent allocations can * consume all the memory that was evicted before checking for * OOM. * * It's hard to iterate the zones from a linux kernel module, which * makes it difficult to determine the watermark dynamically. Instead * we compute the maximum high watermark for this system, based * on the amount of memory, assuming default parameters on Linux kernel * 5.3. */ /* * Base wmark_low is 4 * the square root of Kbytes of RAM. */ long wmark = 4 * int_sqrt(allmem/1024) * 1024; /* * Clamp to between 128K and 64MB. */ wmark = MAX(wmark, 128 * 1024); wmark = MIN(wmark, 64 * 1024 * 1024); /* * watermark_boost can increase the wmark by up to 150%. */ wmark += wmark * 150 / 100; /* * arc_sys_free needs to be more than 2x the watermark, because * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up * to 3x to ensure we're above it. */ arc_sys_free = wmark * 3 + allmem / 32; } void arc_lowmem_fini(void) { spl_unregister_shrinker(&arc_shrinker); } int param_set_arc_long(const char *buf, zfs_kernel_param_t *kp) { int error; error = param_set_long(buf, kp); if (error < 0) return (SET_ERROR(error)); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) { int error; error = param_set_int(buf, kp); if (error < 0) return (SET_ERROR(error)); arc_tuning_update(B_TRUE); return (0); } #else /* _KERNEL */ int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) lowest = -1024; return (lowest); } int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { return (0); } uint64_t arc_all_memory(void) { return (ptob(physmem) / 2); } uint64_t arc_free_memory(void) { return (spa_get_random(arc_all_memory() * 20 / 100)); } #endif /* _KERNEL */ /* * Helper function for arc_prune_async() it is responsible for safely * handling the execution of a registered arc_prune_func_t. */ static void arc_prune_task(void *ptr) { arc_prune_t *ap = (arc_prune_t *)ptr; arc_prune_func_t *func = ap->p_pfunc; if (func != NULL) func(ap->p_adjust, ap->p_private); zfs_refcount_remove(&ap->p_refcnt, func); } /* * Notify registered consumers they must drop holds on a portion of the ARC * buffered they reference. This provides a mechanism to ensure the ARC can * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asynchronously so it may be safely called * in the context of the arc_reclaim_thread(). A reference is taken here * for each registered arc_prune_t and the arc_prune_task() is responsible * for releasing it once the registered arc_prune_func_t has completed. */ void arc_prune_async(int64_t adjust) { arc_prune_t *ap; mutex_enter(&arc_prune_mtx); for (ap = list_head(&arc_prune_list); ap != NULL; ap = list_next(&arc_prune_list, ap)) { if (zfs_refcount_count(&ap->p_refcnt) >= 2) continue; zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); ap->p_adjust = adjust; if (taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP) == TASKQID_INVALID) { zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); continue; } ARCSTAT_BUMP(arcstat_prune); } mutex_exit(&arc_prune_mtx); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, "Limit on number of pages that ARC shrinker can reclaim at once"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/os/linux/zfs/spa_stats.c =================================================================== --- vendor-sys/openzfs/dist/module/os/linux/zfs/spa_stats.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/linux/zfs/spa_stats.c (revision 365340) @@ -1,1047 +1,1043 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include /* * Keeps stats on last N reads per spa_t, disabled by default. */ int zfs_read_history = 0; /* * Include cache hits in history, disabled by default. */ int zfs_read_history_hits = 0; /* * Keeps stats on the last 100 txgs by default. */ int zfs_txg_history = 100; /* * Keeps stats on the last N MMP updates, disabled by default. */ int zfs_multihost_history = 0; /* * ========================================================================== * SPA Read History Routines * ========================================================================== */ /* * Read statistics - Information exported regarding each arc_read call */ typedef struct spa_read_history { hrtime_t start; /* time read completed */ uint64_t objset; /* read from this objset */ uint64_t object; /* read of this object number */ uint64_t level; /* block's indirection level */ uint64_t blkid; /* read of this block id */ char origin[24]; /* read originated from here */ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ pid_t pid; /* PID of task doing read */ char comm[16]; /* process name of task doing read */ procfs_list_node_t srh_node; } spa_read_history_t; static int spa_read_history_show_header(struct seq_file *f) { seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", "level", "blkid", "aflags", "origin", "pid", "process"); return (0); } static int spa_read_history_show(struct seq_file *f, void *data) { spa_read_history_t *srh = (spa_read_history_t *)data; seq_printf(f, "%-8llu %-16llu 0x%-6llx " "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", (u_longlong_t)srh->srh_node.pln_id, srh->start, (longlong_t)srh->objset, (longlong_t)srh->object, (longlong_t)srh->level, (longlong_t)srh->blkid, srh->aflags, srh->origin, srh->pid, srh->comm); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) { spa_read_history_t *srh; while (shl->size > size) { srh = list_remove_head(&shl->procfs_list.pl_list); ASSERT3P(srh, !=, NULL); kmem_free(srh, sizeof (spa_read_history_t)); shl->size--; } if (size == 0) ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } static int spa_read_history_clear(procfs_list_t *procfs_list) { spa_history_list_t *shl = procfs_list->pl_private; mutex_enter(&procfs_list->pl_lock); spa_read_history_truncate(shl, 0); mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_read_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.read_history; char *module; shl->size = 0; module = kmem_asprintf("zfs/%s", spa_name(spa)); shl->procfs_list.pl_private = shl; procfs_list_install(module, "reads", 0600, &shl->procfs_list, spa_read_history_show, spa_read_history_show_header, spa_read_history_clear, offsetof(spa_read_history_t, srh_node)); kmem_strfree(module); } static void spa_read_history_destroy(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.read_history; procfs_list_uninstall(&shl->procfs_list); spa_read_history_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); } void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) { spa_history_list_t *shl = &spa->spa_stats.read_history; spa_read_history_t *srh; ASSERT3P(spa, !=, NULL); ASSERT3P(zb, !=, NULL); if (zfs_read_history == 0 && shl->size == 0) return; if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) return; srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); srh->start = gethrtime(); srh->objset = zb->zb_objset; srh->object = zb->zb_object; srh->level = zb->zb_level; srh->blkid = zb->zb_blkid; srh->aflags = aflags; srh->pid = getpid(); mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, srh); shl->size++; spa_read_history_truncate(shl, zfs_read_history); mutex_exit(&shl->procfs_list.pl_lock); } /* * ========================================================================== * SPA TXG History Routines * ========================================================================== */ /* * Txg statistics - Information exported regarding each txg sync */ typedef struct spa_txg_history { uint64_t txg; /* txg id */ txg_state_t state; /* active txg state */ uint64_t nread; /* number of bytes read */ uint64_t nwritten; /* number of bytes written */ uint64_t reads; /* number of read operations */ uint64_t writes; /* number of write operations */ uint64_t ndirty; /* number of dirty bytes */ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ procfs_list_node_t sth_node; } spa_txg_history_t; static int spa_txg_history_show_header(struct seq_file *f) { seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", "ndirty", "nread", "nwritten", "reads", "writes", "otime", "qtime", "wtime", "stime"); return (0); } static int spa_txg_history_show(struct seq_file *f, void *data) { spa_txg_history_t *sth = (spa_txg_history_t *)data; uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; char state; switch (sth->state) { case TXG_STATE_BIRTH: state = 'B'; break; case TXG_STATE_OPEN: state = 'O'; break; case TXG_STATE_QUIESCED: state = 'Q'; break; case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; case TXG_STATE_SYNCED: state = 'S'; break; case TXG_STATE_COMMITTED: state = 'C'; break; default: state = '?'; break; } if (sth->times[TXG_STATE_OPEN]) open = sth->times[TXG_STATE_OPEN] - sth->times[TXG_STATE_BIRTH]; if (sth->times[TXG_STATE_QUIESCED]) quiesce = sth->times[TXG_STATE_QUIESCED] - sth->times[TXG_STATE_OPEN]; if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - sth->times[TXG_STATE_QUIESCED]; if (sth->times[TXG_STATE_SYNCED]) sync = sth->times[TXG_STATE_SYNCED] - sth->times[TXG_STATE_WAIT_FOR_SYNC]; seq_printf(f, "%-8llu %-16llu %-5c %-12llu " "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, (u_longlong_t)sth->ndirty, (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, (u_longlong_t)sync); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) { spa_txg_history_t *sth; while (shl->size > size) { sth = list_remove_head(&shl->procfs_list.pl_list); ASSERT3P(sth, !=, NULL); kmem_free(sth, sizeof (spa_txg_history_t)); shl->size--; } if (size == 0) ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } static int spa_txg_history_clear(procfs_list_t *procfs_list) { spa_history_list_t *shl = procfs_list->pl_private; mutex_enter(&procfs_list->pl_lock); spa_txg_history_truncate(shl, 0); mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_txg_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.txg_history; char *module; shl->size = 0; module = kmem_asprintf("zfs/%s", spa_name(spa)); shl->procfs_list.pl_private = shl; procfs_list_install(module, "txgs", 0644, &shl->procfs_list, spa_txg_history_show, spa_txg_history_show_header, spa_txg_history_clear, offsetof(spa_txg_history_t, sth_node)); kmem_strfree(module); } static void spa_txg_history_destroy(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.txg_history; procfs_list_uninstall(&shl->procfs_list); spa_txg_history_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); } /* * Add a new txg to historical record. */ void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) { spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; if (zfs_txg_history == 0 && shl->size == 0) return; sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); sth->txg = txg; sth->state = TXG_STATE_OPEN; sth->times[TXG_STATE_BIRTH] = birth_time; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sth); shl->size++; spa_txg_history_truncate(shl, zfs_txg_history); mutex_exit(&shl->procfs_list.pl_lock); } /* * Set txg state completion time and increment current state. */ int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time) { spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; int error = ENOENT; if (zfs_txg_history == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; sth = list_prev(&shl->procfs_list.pl_list, sth)) { if (sth->txg == txg) { sth->times[completed_state] = completed_time; sth->state++; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * Set txg IO stats. */ static int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) { spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; int error = ENOENT; if (zfs_txg_history == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; sth = list_prev(&shl->procfs_list.pl_list, sth)) { if (sth->txg == txg) { sth->nread = nread; sth->nwritten = nwritten; sth->reads = reads; sth->writes = writes; sth->ndirty = ndirty; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } txg_stat_t * spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) { txg_stat_t *ts; if (zfs_txg_history == 0) return (NULL); ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_get_stats(spa->spa_root_vdev, &ts->vs1); spa_config_exit(spa, SCL_CONFIG, FTAG); ts->txg = txg; ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); return (ts); } void spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) { if (ts == NULL) return; if (zfs_txg_history == 0) { kmem_free(ts, sizeof (txg_stat_t)); return; } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_get_stats(spa->spa_root_vdev, &ts->vs2); spa_config_exit(spa, SCL_CONFIG, FTAG); spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); spa_txg_history_set_io(spa, ts->txg, ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], ts->ndirty); kmem_free(ts, sizeof (txg_stat_t)); } /* * ========================================================================== * SPA TX Assign Histogram Routines * ========================================================================== */ /* * Tx statistics - Information exported regarding dmu_tx_assign time. */ /* * When the kstat is written zero all buckets. When the kstat is read * count the number of trailing buckets set to zero and update ks_ndata * such that they are not output. */ static int spa_tx_assign_update(kstat_t *ksp, int rw) { spa_t *spa = ksp->ks_private; spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; int i; if (rw == KSTAT_WRITE) { for (i = 0; i < shk->count; i++) ((kstat_named_t *)shk->priv)[i].value.ui64 = 0; } for (i = shk->count; i > 0; i--) if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0) break; ksp->ks_ndata = i; ksp->ks_data_size = i * sizeof (kstat_named_t); return (0); } static void spa_tx_assign_init(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; char *name; kstat_named_t *ks; kstat_t *ksp; int i; mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); shk->count = 42; /* power of two buckets for 1ns to 2,199s */ shk->size = shk->count * sizeof (kstat_named_t); shk->priv = kmem_alloc(shk->size, KM_SLEEP); name = kmem_asprintf("zfs/%s", spa_name(spa)); for (i = 0; i < shk->count; i++) { ks = &((kstat_named_t *)shk->priv)[i]; ks->data_type = KSTAT_DATA_UINT64; ks->value.ui64 = 0; (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", (u_longlong_t)1 << i); } ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); shk->kstat = ksp; if (ksp) { ksp->ks_lock = &shk->lock; ksp->ks_data = shk->priv; ksp->ks_ndata = shk->count; ksp->ks_data_size = shk->size; ksp->ks_private = spa; ksp->ks_update = spa_tx_assign_update; kstat_install(ksp); } kmem_strfree(name); } static void spa_tx_assign_destroy(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; kstat_t *ksp; ksp = shk->kstat; if (ksp) kstat_delete(ksp); kmem_free(shk->priv, shk->size); mutex_destroy(&shk->lock); } void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) { spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; uint64_t idx = 0; while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) idx++; atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64); } /* * ========================================================================== * SPA IO History Routines * ========================================================================== */ static int spa_io_history_update(kstat_t *ksp, int rw) { if (rw == KSTAT_WRITE) memset(ksp->ks_data, 0, ksp->ks_data_size); return (0); } static void spa_io_history_init(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.io_history; char *name; kstat_t *ksp; mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); name = kmem_asprintf("zfs/%s", spa_name(spa)); ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); shk->kstat = ksp; if (ksp) { ksp->ks_lock = &shk->lock; ksp->ks_private = spa; ksp->ks_update = spa_io_history_update; kstat_install(ksp); } kmem_strfree(name); } static void spa_io_history_destroy(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.io_history; if (shk->kstat) kstat_delete(shk->kstat); mutex_destroy(&shk->lock); } /* * ========================================================================== * SPA MMP History Routines * ========================================================================== */ /* * MMP statistics - Information exported regarding attempted MMP writes * For MMP writes issued, fields used as per comments below. * For MMP writes skipped, an entry represents a span of time when * writes were skipped for same reason (error from mmp_random_leaf). * Differences are: * timestamp time first write skipped, if >1 skipped in a row * mmp_delay delay value at timestamp * vdev_guid number of writes skipped * io_error one of enum mmp_error * duration time span (ns) of skipped writes */ typedef struct spa_mmp_history { uint64_t mmp_node_id; /* unique # for updates */ uint64_t txg; /* txg of last sync */ uint64_t timestamp; /* UTC time MMP write issued */ uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ uint64_t vdev_guid; /* unique ID of leaf vdev */ char *vdev_path; int vdev_label; /* vdev label */ int io_error; /* error status of MMP write */ hrtime_t error_start; /* hrtime of start of error period */ hrtime_t duration; /* time from submission to completion */ procfs_list_node_t smh_node; } spa_mmp_history_t; static int spa_mmp_history_show_header(struct seq_file *f) { seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); return (0); } static int spa_mmp_history_show(struct seq_file *f, void *data) { spa_mmp_history_t *smh = (spa_mmp_history_t *)data; char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " "%-10lld %s\n"; char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " "%-10lld %s\n"; seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, (smh->vdev_path ? smh->vdev_path : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) { spa_mmp_history_t *smh; while (shl->size > size) { smh = list_remove_head(&shl->procfs_list.pl_list); if (smh->vdev_path) kmem_strfree(smh->vdev_path); kmem_free(smh, sizeof (spa_mmp_history_t)); shl->size--; } if (size == 0) ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } static int spa_mmp_history_clear(procfs_list_t *procfs_list) { spa_history_list_t *shl = procfs_list->pl_private; mutex_enter(&procfs_list->pl_lock); spa_mmp_history_truncate(shl, 0); mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_mmp_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; char *module; shl->size = 0; module = kmem_asprintf("zfs/%s", spa_name(spa)); shl->procfs_list.pl_private = shl; procfs_list_install(module, "multihost", 0644, &shl->procfs_list, spa_mmp_history_show, spa_mmp_history_show_header, spa_mmp_history_clear, offsetof(spa_mmp_history_t, smh_node)); kmem_strfree(module); } static void spa_mmp_history_destroy(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; procfs_list_uninstall(&shl->procfs_list); spa_mmp_history_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); } /* * Set duration in existing "skip" record to how long we have waited for a leaf * vdev to become available. * * Important that we start search at the tail of the list where new * records are inserted, so this is normally an O(1) operation. */ int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; spa_mmp_history_t *smh; int error = ENOENT; if (zfs_multihost_history == 0 && shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; smh = list_prev(&shl->procfs_list.pl_list, smh)) { if (smh->mmp_node_id == mmp_node_id) { ASSERT3U(smh->io_error, !=, 0); smh->duration = gethrtime() - smh->error_start; smh->vdev_guid++; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * Set MMP write duration and error status in existing record. * See comment re: search order above spa_mmp_history_set_skip(). */ int spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, hrtime_t duration) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; spa_mmp_history_t *smh; int error = ENOENT; if (zfs_multihost_history == 0 && shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; smh = list_prev(&shl->procfs_list.pl_list, smh)) { if (smh->mmp_node_id == mmp_node_id) { ASSERT(smh->io_error == 0); smh->io_error = io_error; smh->duration = duration; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * Add a new MMP historical record. * error == 0 : a write was issued. * error != 0 : a write was not issued because no leaves were found. */ void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, int error) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; spa_mmp_history_t *smh; if (zfs_multihost_history == 0 && shl->size == 0) return; smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); smh->txg = txg; smh->timestamp = timestamp; smh->mmp_delay = mmp_delay; if (vd) { smh->vdev_guid = vd->vdev_guid; if (vd->vdev_path) smh->vdev_path = kmem_strdup(vd->vdev_path); } smh->vdev_label = label; smh->mmp_node_id = mmp_node_id; if (error) { smh->io_error = error; smh->error_start = gethrtime(); smh->vdev_guid = 1; } mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, smh); shl->size++; spa_mmp_history_truncate(shl, zfs_multihost_history); mutex_exit(&shl->procfs_list.pl_lock); } static void * spa_state_addr(kstat_t *ksp, loff_t n) { - return (ksp->ks_private); /* return the spa_t */ + if (n == 0) + return (ksp->ks_private); /* return the spa_t */ + return (NULL); } static int spa_state_data(char *buf, size_t size, void *data) { spa_t *spa = (spa_t *)data; (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); return (0); } /* * Return the state of the pool in /proc/spl/kstat/zfs//state. * * This is a lock-less read of the pool's state (unlike using 'zpool', which * can potentially block for seconds). Because it doesn't block, it can useful * as a pool heartbeat value. */ static void spa_state_init(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.state; char *name; kstat_t *ksp; mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); name = kmem_asprintf("zfs/%s", spa_name(spa)); ksp = kstat_create(name, 0, "state", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); shk->kstat = ksp; if (ksp) { ksp->ks_lock = &shk->lock; ksp->ks_data = NULL; ksp->ks_private = spa; ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); kstat_install(ksp); } kmem_strfree(name); } static void spa_health_destroy(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.state; kstat_t *ksp = shk->kstat; if (ksp) kstat_delete(ksp); mutex_destroy(&shk->lock); } static spa_iostats_t spa_iostats_template = { { "trim_extents_written", KSTAT_DATA_UINT64 }, { "trim_bytes_written", KSTAT_DATA_UINT64 }, { "trim_extents_skipped", KSTAT_DATA_UINT64 }, { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, { "trim_extents_failed", KSTAT_DATA_UINT64 }, { "trim_bytes_failed", KSTAT_DATA_UINT64 }, { "autotrim_extents_written", KSTAT_DATA_UINT64 }, { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, }; #define SPA_IOSTATS_ADD(stat, val) \ atomic_add_64(&iostats->stat.value.ui64, (val)); void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed) { spa_history_kstat_t *shk = &spa->spa_stats.iostats; kstat_t *ksp = shk->kstat; spa_iostats_t *iostats; if (ksp == NULL) return; iostats = ksp->ks_data; if (type == TRIM_TYPE_MANUAL) { SPA_IOSTATS_ADD(trim_extents_written, extents_written); SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); } else if (type == TRIM_TYPE_AUTO) { SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); } else { SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); } } static int spa_iostats_update(kstat_t *ksp, int rw) { if (rw == KSTAT_WRITE) { memcpy(ksp->ks_data, &spa_iostats_template, sizeof (spa_iostats_t)); } return (0); } static void spa_iostats_init(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.iostats; mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); char *name = kmem_asprintf("zfs/%s", spa_name(spa)); kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); shk->kstat = ksp; if (ksp) { int size = sizeof (spa_iostats_t); ksp->ks_lock = &shk->lock; ksp->ks_private = spa; ksp->ks_update = spa_iostats_update; ksp->ks_data = kmem_alloc(size, KM_SLEEP); memcpy(ksp->ks_data, &spa_iostats_template, size); kstat_install(ksp); } kmem_strfree(name); } static void spa_iostats_destroy(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.iostats; kstat_t *ksp = shk->kstat; if (ksp) { kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); kstat_delete(ksp); } mutex_destroy(&shk->lock); } void spa_stats_init(spa_t *spa) { spa_read_history_init(spa); spa_txg_history_init(spa); spa_tx_assign_init(spa); spa_io_history_init(spa); spa_mmp_history_init(spa); spa_state_init(spa); spa_iostats_init(spa); } void spa_stats_destroy(spa_t *spa) { spa_iostats_destroy(spa); spa_health_destroy(spa); spa_tx_assign_destroy(spa); spa_txg_history_destroy(spa); spa_read_history_destroy(spa); spa_io_history_destroy(spa); spa_mmp_history_destroy(spa); } -#if defined(_KERNEL) -/* CSTYLED */ -module_param(zfs_read_history, int, 0644); -MODULE_PARM_DESC(zfs_read_history, - "Historical statistics for the last N reads"); +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW, + "Historical statistics for the last N reads"); -module_param(zfs_read_history_hits, int, 0644); -MODULE_PARM_DESC(zfs_read_history_hits, - "Include cache hits in read history"); +ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW, + "Include cache hits in read history"); -module_param(zfs_txg_history, int, 0644); -MODULE_PARM_DESC(zfs_txg_history, - "Historical statistics for the last N txgs"); +ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW, + "Historical statistics for the last N txgs"); -module_param(zfs_multihost_history, int, 0644); -MODULE_PARM_DESC(zfs_multihost_history, - "Historical statistics for last N multihost writes"); +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW, + "Historical statistics for last N multihost writes"); /* END CSTYLED */ -#endif Index: vendor-sys/openzfs/dist/module/os/linux/zfs/zfs_vfsops.c =================================================================== --- vendor-sys/openzfs/dist/module/os/linux/zfs/zfs_vfsops.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/linux/zfs/zfs_vfsops.c (revision 365340) @@ -1,2165 +1,2175 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" enum { TOKEN_RO, TOKEN_RW, TOKEN_SETUID, TOKEN_NOSETUID, TOKEN_EXEC, TOKEN_NOEXEC, TOKEN_DEVICES, TOKEN_NODEVICES, TOKEN_DIRXATTR, TOKEN_SAXATTR, TOKEN_XATTR, TOKEN_NOXATTR, TOKEN_ATIME, TOKEN_NOATIME, TOKEN_RELATIME, TOKEN_NORELATIME, TOKEN_NBMAND, TOKEN_NONBMAND, TOKEN_MNTPOINT, TOKEN_LAST, }; static const match_table_t zpl_tokens = { { TOKEN_RO, MNTOPT_RO }, { TOKEN_RW, MNTOPT_RW }, { TOKEN_SETUID, MNTOPT_SETUID }, { TOKEN_NOSETUID, MNTOPT_NOSETUID }, { TOKEN_EXEC, MNTOPT_EXEC }, { TOKEN_NOEXEC, MNTOPT_NOEXEC }, { TOKEN_DEVICES, MNTOPT_DEVICES }, { TOKEN_NODEVICES, MNTOPT_NODEVICES }, { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, { TOKEN_SAXATTR, MNTOPT_SAXATTR }, { TOKEN_XATTR, MNTOPT_XATTR }, { TOKEN_NOXATTR, MNTOPT_NOXATTR }, { TOKEN_ATIME, MNTOPT_ATIME }, { TOKEN_NOATIME, MNTOPT_NOATIME }, { TOKEN_RELATIME, MNTOPT_RELATIME }, { TOKEN_NORELATIME, MNTOPT_NORELATIME }, { TOKEN_NBMAND, MNTOPT_NBMAND }, { TOKEN_NONBMAND, MNTOPT_NONBMAND }, { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, { TOKEN_LAST, NULL }, }; static void zfsvfs_vfs_free(vfs_t *vfsp) { if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) kmem_strfree(vfsp->vfs_mntpoint); kmem_free(vfsp, sizeof (vfs_t)); } } static int zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) { switch (token) { case TOKEN_RO: vfsp->vfs_readonly = B_TRUE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_RW: vfsp->vfs_readonly = B_FALSE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_SETUID: vfsp->vfs_setuid = B_TRUE; vfsp->vfs_do_setuid = B_TRUE; break; case TOKEN_NOSETUID: vfsp->vfs_setuid = B_FALSE; vfsp->vfs_do_setuid = B_TRUE; break; case TOKEN_EXEC: vfsp->vfs_exec = B_TRUE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_NOEXEC: vfsp->vfs_exec = B_FALSE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_DEVICES: vfsp->vfs_devices = B_TRUE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_NODEVICES: vfsp->vfs_devices = B_FALSE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_DIRXATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_SAXATTR: vfsp->vfs_xattr = ZFS_XATTR_SA; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_XATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_NOXATTR: vfsp->vfs_xattr = ZFS_XATTR_OFF; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_ATIME: vfsp->vfs_atime = B_TRUE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_NOATIME: vfsp->vfs_atime = B_FALSE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_RELATIME: vfsp->vfs_relatime = B_TRUE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NORELATIME: vfsp->vfs_relatime = B_FALSE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NBMAND: vfsp->vfs_nbmand = B_TRUE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_NONBMAND: vfsp->vfs_nbmand = B_FALSE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_MNTPOINT: vfsp->vfs_mntpoint = match_strdup(&args[0]); if (vfsp->vfs_mntpoint == NULL) return (SET_ERROR(ENOMEM)); break; default: break; } return (0); } /* * Parse the raw mntopts and return a vfs_t describing the options. */ static int zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) { vfs_t *tmp_vfsp; int error; tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); if (mntopts != NULL) { substring_t args[MAX_OPT_ARGS]; char *tmp_mntopts, *p, *t; int token; tmp_mntopts = t = kmem_strdup(mntopts); if (tmp_mntopts == NULL) return (SET_ERROR(ENOMEM)); while ((p = strsep(&t, ",")) != NULL) { if (!*p) continue; args[0].to = args[0].from = NULL; token = match_token(p, zpl_tokens, args); error = zfsvfs_parse_option(p, token, args, tmp_vfsp); if (error) { kmem_strfree(tmp_mntopts); zfsvfs_vfs_free(tmp_vfsp); return (error); } } kmem_strfree(tmp_mntopts); } *vfsp = tmp_vfsp; return (0); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY)); } /*ARGSUSED*/ int zfs_sync(struct super_block *sb, int wait, cred_t *cr) { zfsvfs_t *zfsvfs = sb->s_fs_info; /* * Semantically, the only requirement is that the sync be initiated. * The DMU syncs out txgs frequently, so there's nothing to do. */ if (!wait) return (0); if (zfsvfs != NULL) { /* * Sync a specific filesystem. */ dsl_pool_t *dp; ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; /* * Update SB_NOATIME bit in VFS super block. Since atime update is * determined by atime_needs_update(), atime_needs_update() needs to * return false if atime is turned off, and not unconditionally return * false if atime is turned on. */ if (newval) sb->s_flags &= ~SB_NOATIME; else sb->s_flags |= SB_NOATIME; } static void relatime_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_relatime = newval; } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void acltype_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; switch (newval) { case ZFS_ACLTYPE_OFF: zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; break; case ZFS_ACLTYPE_POSIXACL: #ifdef CONFIG_FS_POSIX_ACL zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL; zfsvfs->z_sb->s_flags |= SB_POSIXACL; #else zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; #endif /* CONFIG_FS_POSIX_ACL */ break; default: break; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval) sb->s_flags |= SB_RDONLY; else sb->s_flags &= ~SB_RDONLY; } static void devices_changed_cb(void *arg, uint64_t newval) { } static void setuid_changed_cb(void *arg, uint64_t newval) { } static void exec_changed_cb(void *arg, uint64_t newval) { } static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval == TRUE) sb->s_flags |= SB_MANDLOCK; else sb->s_flags &= ~SB_MANDLOCK; } static void snapdir_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) { vfsp->vfs_do_readonly = B_TRUE; vfsp->vfs_readonly = B_TRUE; } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (vfsp->vfs_do_readonly) readonly_changed_cb(zfsvfs, vfsp->vfs_readonly); if (vfsp->vfs_do_setuid) setuid_changed_cb(zfsvfs, vfsp->vfs_setuid); if (vfsp->vfs_do_exec) exec_changed_cb(zfsvfs, vfsp->vfs_exec); if (vfsp->vfs_do_devices) devices_changed_cb(zfsvfs, vfsp->vfs_devices); if (vfsp->vfs_do_xattr) xattr_changed_cb(zfsvfs, vfsp->vfs_xattr); if (vfsp->vfs_do_atime) atime_changed_cb(zfsvfs, vfsp->vfs_atime); if (vfsp->vfs_do_relatime) relatime_changed_cb(zfsvfs, vfsp->vfs_relatime); if (vfsp->vfs_do_nbmand) nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Takes a dataset, a property, a value and that value's setpoint as * found in the ZAP. Checks if the property has been changed in the vfs. * If so, val and setpoint will be overwritten with updated content. * Otherwise, they are left unchanged. */ int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) return (EINVAL); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (zfvp == NULL) return (ESRCH); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfsp->vfs_do_atime) tmp = vfsp->vfs_atime; break; case ZFS_PROP_RELATIME: if (vfsp->vfs_do_relatime) tmp = vfsp->vfs_relatime; break; case ZFS_PROP_DEVICES: if (vfsp->vfs_do_devices) tmp = vfsp->vfs_devices; break; case ZFS_PROP_EXEC: if (vfsp->vfs_do_exec) tmp = vfsp->vfs_exec; break; case ZFS_PROP_SETUID: if (vfsp->vfs_do_setuid) tmp = vfsp->vfs_setuid; break; case ZFS_PROP_READONLY: if (vfsp->vfs_do_readonly) tmp = vfsp->vfs_readonly; break; case ZFS_PROP_XATTR: if (vfsp->vfs_do_xattr) tmp = vfsp->vfs_xattr; break; case ZFS_PROP_NBMAND: if (vfsp->vfs_do_nbmand) tmp = vfsp->vfs_nbmand; break; default: return (ENOENT); } if (tmp != *val) { (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printk("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.\n", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); if ((error == 0) && (val == ZFS_XATTR_SA)) zfsvfs->z_xattr_sa = B_TRUE; } error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); return (0); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); } return (error); } /* * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function * on a failure. Do not pass in a statically allocated zfsvfs. */ int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_sb = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (int i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } error = zfsvfs_init(zfsvfs, os); if (error != 0) { *zfvp = NULL; zfsvfs_free(zfsvfs); return (error); } zfsvfs->z_drain_task = TASKQID_INVALID; zfsvfs->z_draining = B_FALSE; zfsvfs->z_drain_cancel = B_TRUE; *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; boolean_t readonly = zfs_is_readonly(zfsvfs); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); /* * During replay we remove the read only flag to * allow replays to succeed. */ if (readonly != 0) { readonly_changed_cb(zfsvfs, B_FALSE); } else { zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } /* restore readonly bit */ if (readonly != 0) readonly_changed_cb(zfsvfs, B_TRUE); } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i, size = zfsvfs->z_hold_size; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrm_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); zfsvfs_vfs_free(zfsvfs->z_vfs); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef HAVE_MLSLABEL /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 0 : SET_ERROR(EACCES)); } return (SET_ERROR(EACCES)); } #endif /* HAVE_MLSLABEL */ static int zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, uint32_t bshift) { char buf[20 + DMU_OBJACCT_PREFIX_LEN]; uint64_t offset = DMU_OBJACCT_PREFIX_LEN; uint64_t quota; uint64_t used; int err; strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, sizeof (buf) - offset, B_FALSE); if (err) return (err); if (zfsvfs->z_projectquota_obj == 0) goto objs; err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) goto objs; else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf + offset, 8, 1, &used); if (unlikely(err == ENOENT)) { uint32_t blksize; u_longlong_t nblocks; /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); if (unlikely(zp->z_blksz == 0)) blksize = zfsvfs->z_max_blksz; used = blksize * nblocks; } else if (err) { return (err); } statp->f_blocks = quota >> bshift; statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; statp->f_bavail = statp->f_bfree; objs: if (zfsvfs->z_projectobjquota_obj == 0) return (0); err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) return (0); else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf, 8, 1, &used); if (unlikely(err == ENOENT)) { /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ used = 1; } else if (err) { return (err); } statp->f_files = quota; statp->f_ffree = (quota > used) ? (quota - used) : 0; return (0); } int zfs_statvfs(struct inode *ip, struct kstatfs *statp) { zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os); /* * The underlying storage pool actually uses multiple block * size. Under Solaris frsize (fragment size) is reported as * the smallest block size we support, and bsize (block size) * as the filesystem's maximum block size. Unfortunately, * under Linux the fragment size and block size are often used * interchangeably. Thus we are forced to report both of them * as the filesystem's maximum block size. */ statp->f_frsize = zfsvfs->z_max_blksz; statp->f_bsize = zfsvfs->z_max_blksz; uint32_t bshift = fls(statp->f_bsize) - 1; /* * The following report "total" blocks of various kinds in * the file system, but reported in terms of f_bsize - the * "preferred" size. */ /* Round up so we never have a filesystem using 0 blocks. */ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize); statp->f_blocks = (refdbytes + availbytes) >> bshift; statp->f_bfree = availbytes >> bshift; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of objects available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT); statp->f_files = statp->f_ffree + usedobjs; statp->f_fsid.val[0] = (uint32_t)fsid; statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); statp->f_type = ZFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; /* * We have all of 40 characters to stuff a string here. * Is there anything useful we could/should provide? */ bzero(statp->f_spare, sizeof (statp->f_spare)); if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && dmu_objset_projectquota_present(zfsvfs->z_os)) { znode_t *zp = ITOZ(ip); if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && zpl_is_valid_projid(zp->z_projid)) err = zfs_statfs_project(zfsvfs, zp, statp, bshift); } ZFS_EXIT(zfsvfs); return (err); } static int zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) { znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *ipp = ZTOI(rootzp); ZFS_EXIT(zfsvfs); return (error); } /* * Linux kernels older than 3.1 do not support a per-filesystem shrinker. * To accommodate this we must improvise and manually walk the list of znodes * attempting to prune dentries in order to be able to drop the inodes. * * To avoid scanning the same znodes multiple times they are always rotated * to the end of the z_all_znodes list. New znodes are inserted at the * end of the list so we're always scanning the oldest znodes first. */ static int zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) { znode_t **zp_array, *zp; int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); int objects = 0; int i = 0, j = 0; zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); mutex_enter(&zfsvfs->z_znodes_lock); while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { if ((i++ > nr_to_scan) || (j >= max_array)) break; ASSERT(list_link_active(&zp->z_link_node)); list_remove(&zfsvfs->z_all_znodes, zp); list_insert_tail(&zfsvfs->z_all_znodes, zp); /* Skip active znodes and .zfs entries */ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) continue; if (igrab(ZTOI(zp)) == NULL) continue; zp_array[j] = zp; j++; } mutex_exit(&zfsvfs->z_znodes_lock); for (i = 0; i < j; i++) { zp = zp_array[i]; ASSERT3P(zp, !=, NULL); d_prune_aliases(ZTOI(zp)); if (atomic_read(&ZTOI(zp)->i_count) == 1) objects++; zrele(zp); } kmem_free(zp_array, max_array * sizeof (znode_t *)); return (objects); } /* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. */ int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; struct shrinker *shrinker = &sb->s_shrink; struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, }; ZFS_ENTER(zfsvfs); #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); } } else { *objects = (*shrinker->scan_objects)(shrinker, &sc); } #elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) *objects = (*shrinker->scan_objects)(shrinker, &sc); #elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) *objects = (*shrinker->shrink)(shrinker, &sc); #elif defined(HAVE_D_PRUNE_ALIASES) #define D_PRUNE_ALIASES_IS_DEFAULT *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #else #error "No available dentry and inode cache pruning mechanism." #endif #if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT) #undef D_PRUNE_ALIASES_IS_DEFAULT /* * Fall back to zfs_prune_aliases if the kernel's per-superblock * shrinker couldn't free anything, possibly due to the inodes being * allocated in a different memcg. */ if (*objects == 0) *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #endif ZFS_EXIT(zfsvfs); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "pruning, nr_to_scan=%lu objects=%d error=%d\n", nr_to_scan, *objects, error); return (error); } /* * Teardown the zfsvfs_t. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; zfs_unlinked_drain_stop_wait(zfsvfs); /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but iputs run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * * We can safely read z_nr_znodes without locking because the * VFS has already blocked operations which add to the * z_all_znodes list and thus increment z_nr_znodes. */ int round = 0; while (zfsvfs->z_nr_znodes > 0) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's super block as the * parent filesystem and all of its snapshots have their * inode's super block set to the parent's filesystem's * super block. Note, 'z_parent' is self referential * for non-snapshots. */ shrink_dcache_sb(zfsvfs->z_parent->z_sb); } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no VFS ops active, and any new VFS ops * will fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. We also grab an extra reference to all * the remaining inodes so that the kernel does not attempt to free * any inodes of a suspended fs. This can cause deadlocks since the * zfs_resume_fs() process may involve starting threads, which might * attempt to free unreferenced inodes to free up memory for the new * thread. */ if (!unmounting) { mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); if (igrab(ZTOI(zp)) != NULL) zp->z_suspended = B_TRUE; } mutex_exit(&zfsvfs->z_znodes_lock); } /* * If we are unmounting, set the unmounted flag and let new VFS ops * unblock. zfs_inactive will have the unmounted behavior, and all * other VFS ops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ objset_t *os = zfsvfs->z_os; boolean_t os_dirty = B_FALSE; for (int t = 0; t < TXG_SIZE; t++) { if (dmu_objset_is_dirty(os, t)) { os_dirty = B_TRUE; break; } } if (!zfs_is_readonly(zfsvfs) && os_dirty) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); } dmu_objset_evict_dbufs(zfsvfs->z_os); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } #if defined(HAVE_SUPER_SETUP_BDI_NAME) atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); #endif int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) { const char *osname = zm->mnt_osname; struct inode *root_inode; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; vfs_t *vfs = NULL; ASSERT(zm); ASSERT(osname); error = zfsvfs_parse_options(zm->mnt_data, &vfs); if (error) return (error); error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { zfsvfs_vfs_free(vfs); goto out; } if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) { zfsvfs_vfs_free(vfs); goto out; } vfs->vfs_data = zfsvfs; zfsvfs->z_vfs = vfs; zfsvfs->z_sb = sb; sb->s_fs_info = zfsvfs; sb->s_magic = ZFS_SUPER_MAGIC; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_blocksize = recordsize; sb->s_blocksize_bits = ilog2(recordsize); error = -zpl_bdi_setup(sb, "zfs"); if (error) goto out; sb->s_bdi->ra_pages = 0; /* Set callback operations for the file system. */ sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; sb->s_d_op = &zpl_dentry_operations; /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acltype_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; zfsvfs->z_snap_defer_time = jiffies; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } /* Allocate a root inode for the filesystem. */ error = zfs_root(zfsvfs, &root_inode); if (error) { (void) zfs_umount(sb); goto out; } /* Allocate a root dentry for the filesystem */ sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) { (void) zfs_umount(sb); error = SET_ERROR(ENOMEM); goto out; } if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb); out: if (error) { if (zfsvfs != NULL) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } /* * make sure we don't have dangling sb->s_fs_info which * zfs_preumount will use. */ sb->s_fs_info = NULL; } return (error); } /* * Called when an unmount is requested and certain sanity checks have * already passed. At this point no dentries or inodes have been reclaimed * from their respective caches. We drop the extra reference on the .zfs * control directory to allow everything to be reclaimed. All snapshots * must already have been unmounted to reach this point. */ void zfs_preumount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; /* zfsvfs is NULL when zfs_domount fails during mount */ if (zfsvfs) { zfs_unlinked_drain_stop_wait(zfsvfs); zfsctl_destroy(sb->s_fs_info); /* * Wait for zrele_async before entering evict_inodes in * generic_shutdown_super. The reason we must finish before * evict_inodes is when lazytime is on, or when zfs_purgedir * calls zfs_zget, zrele would bump i_count from 0 to 1. This * would race with the i_count check in evict_inodes. This means * it could destroy the inode while we are still using it. * * We wait for two passes. xattr directories in the first pass * may add xattr entries in zfs_purgedir, so in the second pass * we wait for them. We don't use taskq_wait here because it is * a pool wide taskq. Other mounted filesystems can constantly * do zrele_async and there's no guarantee when taskq will be * empty. */ taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); } } /* * Called once all other unmount released tear down has occurred. * It is our responsibility to release any remaining infrastructure. */ /*ARGSUSED*/ int zfs_umount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; objset_t *os; if (zfsvfs->z_arc_prune != NULL) arc_remove_prune_callback(zfsvfs->z_arc_prune); VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; zpl_bdi_destroy(sb); /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } zfsvfs_free(zfsvfs); return (0); } int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) { zfsvfs_t *zfsvfs = sb->s_fs_info; vfs_t *vfsp; boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); int error; if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && !(*flags & SB_RDONLY)) { *flags |= SB_RDONLY; return (EROFS); } error = zfsvfs_parse_options(zm->mnt_data, &vfsp); if (error) return (error); if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); zfs_unregister_callbacks(zfsvfs); zfsvfs_vfs_free(zfsvfs->z_vfs); vfsp->vfs_data = zfsvfs; zfsvfs->z_vfs = vfsp; if (!issnap) (void) zfs_register_callbacks(vfsp); return (error); } int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) { zfsvfs_t *zfsvfs = sb->s_fs_info; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *ipp = NULL; if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { return (SET_ERROR(EINVAL)); } /* LONG_FID_LEN means snapdirs */ if (fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); if (objsetid != ZFSCTL_INO_SNAPDIRS - object) { dprintf("snapdir fid: objsetid (%llu) != " "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n", objsetid, ZFSCTL_INO_SNAPDIRS, object); return (SET_ERROR(EINVAL)); } if (fid_gen > 1 || setgen != 0) { dprintf("snapdir fid: fid_gen (%llu) and setgen " "(%llu)\n", fid_gen, setgen); return (SET_ERROR(EINVAL)); } return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); } ZFS_ENTER(zfsvfs); /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { *ipp = zfsvfs->z_ctldir; ASSERT(*ipp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); } else { igrab(*ipp); } ZFS_EXIT(zfsvfs); return (0); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { ZFS_EXIT(zfsvfs); return (err); } /* Don't export xattr stuff */ if (zp->z_pflags & ZFS_XATTR) { zrele(zp); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if ((fid_gen == 0) && (zfsvfs->z_root == object)) fid_gen = zp_gen; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); zrele(zp); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } *ipp = ZTOI(zp); if (*ipp) zfs_inode_update(ITOZ(*ipp)); ZFS_EXIT(zfsvfs); return (0); } /* * Block out VFS ops and close zfsvfs_t * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err, err2; znode_t *zp; ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); zfsvfs->z_rollback_time = jiffies; /* * Attempt to re-establish all the active inodes with their * dbufs. If a zfs_rezget() fails, then we unhash the inode * and mark it stale. This prevents a collision if a new * inode/object is created which must use the same inode * number. The stale inode will be be released when the * VFS prunes the dentry holding the remaining references * on the stale inode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { err2 = zfs_rezget(zp); if (err2) { remove_inode_hash(ZTOI(zp)); zp->z_is_stale = B_TRUE; } /* see comment in zfs_suspend_fs() */ if (zp->z_suspended) { zfs_zrele_async(zp); zp->z_suspended = B_FALSE; } } mutex_exit(&zfsvfs->z_znodes_lock); if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) { /* * zfs_suspend_fs() could have interrupted freeing * of dnodes. We need to restart this freeing so * that we don't "leak" the space. */ zfs_unlinked_drain(zfsvfs); } /* * Most of the time zfs_suspend_fs is used for changing the contents * of the underlying dataset. ZFS rollback and receive operations * might create files for which negative dentries are present in * the cache. Since walking the dcache would require a lot of GPL-only * code duplication, it's much easier on these rather rare occasions * just to flush the whole dcache for the given dataset/filesystem. */ shrink_dcache_sb(zfsvfs->z_sb); bail: if (err != 0) zfsvfs->z_unmounted = B_TRUE; /* release the VFS ops */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err != 0) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (zfsvfs->z_os) (void) zfs_umount(zfsvfs->z_sb); } return (err); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_sb); zfsvfs->z_unmounted = B_TRUE; return (0); } /* * Automounted snapshots rely on periodic revalidation * to defer snapshots from being automatically unmounted. */ inline void zfs_exit_fs(zfsvfs_t *zfsvfs) { if (!zfsvfs->z_issnap) return; if (time_after(jiffies, zfsvfs->z_snap_defer_time + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { zfsvfs->z_snap_defer_time = jiffies; zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); } } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; case ZFS_PROP_ACLTYPE: *value = ZFS_ACLTYPE_OFF; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. */ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_unmounted) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } +/*ARGSUSED*/ +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ + /* + * We don't need to do anything here, the devname is always current by + * virtue of zfsvfs->z_sb->s_op->show_devname. + */ +} + void zfs_init(void) { zfsctl_init(); zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); } void zfs_fini(void) { /* * we don't use outstanding because zpl_posix_acl_free might add more. */ taskq_wait(system_delay_taskq); taskq_wait(system_taskq); unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_suspend_fs); EXPORT_SYMBOL(zfs_resume_fs); EXPORT_SYMBOL(zfs_set_version); EXPORT_SYMBOL(zfsvfs_create); EXPORT_SYMBOL(zfsvfs_free); EXPORT_SYMBOL(zfs_is_readonly); EXPORT_SYMBOL(zfs_domount); EXPORT_SYMBOL(zfs_preumount); EXPORT_SYMBOL(zfs_umount); EXPORT_SYMBOL(zfs_remount); EXPORT_SYMBOL(zfs_statvfs); EXPORT_SYMBOL(zfs_vget); EXPORT_SYMBOL(zfs_prune); #endif Index: vendor-sys/openzfs/dist/module/os/linux/zfs/zpl_super.c =================================================================== --- vendor-sys/openzfs/dist/module/os/linux/zfs/zpl_super.c (revision 365339) +++ vendor-sys/openzfs/dist/module/os/linux/zfs/zpl_super.c (revision 365340) @@ -1,326 +1,346 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. */ #include #include #include #include #include static struct inode * zpl_inode_alloc(struct super_block *sb) { struct inode *ip; VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); inode_set_iversion(ip, 1); return (ip); } static void zpl_inode_destroy(struct inode *ip) { ASSERT(atomic_read(&ip->i_count) == 0); zfs_inode_destroy(ip); } /* * Called from __mark_inode_dirty() to reflect that something in the * inode has changed. We use it to ensure the znode system attributes * are always strictly update to date with respect to the inode. */ #ifdef HAVE_DIRTY_INODE_WITH_FLAGS static void zpl_dirty_inode(struct inode *ip, int flags) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_dirty_inode(ip, flags); spl_fstrans_unmark(cookie); } #else static void zpl_dirty_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_dirty_inode(ip, 0); spl_fstrans_unmark(cookie); } #endif /* HAVE_DIRTY_INODE_WITH_FLAGS */ /* * When ->drop_inode() is called its return value indicates if the * inode should be evicted from the inode cache. If the inode is * unhashed and has no links the default policy is to evict it * immediately. * * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the * truncate in evict(). For earlier kernels clear_inode() maps to * end_writeback() which is responsible for completing all outstanding * write back. In either case, once this is done it is safe to cleanup * any remaining inode specific data via zfs_inactive(). * remaining filesystem specific data. */ static void zpl_evict_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); truncate_setsize(ip, 0); clear_inode(ip); zfs_inactive(ip); spl_fstrans_unmark(cookie); } static void zpl_put_super(struct super_block *sb) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_umount(sb); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); } static int zpl_sync_fs(struct super_block *sb, int wait) { fstrans_cookie_t cookie; cred_t *cr = CRED(); int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_sync(sb, wait, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_statfs(struct dentry *dentry, struct kstatfs *statp) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_statvfs(dentry->d_inode, statp); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); /* * If required by a 32-bit system call, dynamically scale the * block size up to 16MiB and decrease the block counts. This * allows for a maximum size of 64EiB to be reported. The file * counts must be artificially capped at 2^32-1. */ if (unlikely(zpl_is_32bit_api())) { while (statp->f_blocks > UINT32_MAX && statp->f_bsize < SPA_MAXBLOCKSIZE) { statp->f_frsize <<= 1; statp->f_bsize <<= 1; statp->f_blocks >>= 1; statp->f_bfree >>= 1; statp->f_bavail >>= 1; } uint64_t usedobjs = statp->f_files - statp->f_ffree; statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs); statp->f_files = statp->f_ffree + usedobjs; } return (error); } static int zpl_remount_fs(struct super_block *sb, int *flags, char *data) { zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data }; fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_remount(sb, flags, &zm); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } static int +__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) +{ + char *fsname; + + fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + dmu_objset_name(zfsvfs->z_os, fsname); + seq_puts(seq, fsname); + kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); + + return (0); +} + +static int +zpl_show_devname(struct seq_file *seq, struct dentry *root) +{ + return (__zpl_show_devname(seq, root->d_sb->s_fs_info)); +} + +static int __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) { seq_printf(seq, ",%s", zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr"); #ifdef CONFIG_FS_POSIX_ACL switch (zfsvfs->z_acl_type) { case ZFS_ACLTYPE_POSIXACL: seq_puts(seq, ",posixacl"); break; default: seq_puts(seq, ",noacl"); break; } #endif /* CONFIG_FS_POSIX_ACL */ return (0); } static int zpl_show_options(struct seq_file *seq, struct dentry *root) { return (__zpl_show_options(seq, root->d_sb->s_fs_info)); } static int zpl_fill_super(struct super_block *sb, void *data, int silent) { zfs_mnt_t *zm = (zfs_mnt_t *)data; fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_domount(sb, zm, silent); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } static int zpl_test_super(struct super_block *s, void *data) { zfsvfs_t *zfsvfs = s->s_fs_info; objset_t *os = data; if (zfsvfs == NULL) return (0); return (os == zfsvfs->z_os); } static struct super_block * zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) { struct super_block *s; objset_t *os; int err; err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); if (err) return (ERR_PTR(-err)); /* * The dsl pool lock must be released prior to calling sget(). * It is possible sget() may block on the lock in grab_super() * while deactivate_super() holds that same lock and waits for * a txg sync. If the dsl_pool lock is held over sget() * this can prevent the pool sync and cause a deadlock. */ dsl_pool_rele(dmu_objset_pool(os), FTAG); s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) return (ERR_CAST(s)); if (s->s_root == NULL) { err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0); if (err) { deactivate_locked_super(s); return (ERR_PTR(err)); } s->s_flags |= SB_ACTIVE; } else if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return (ERR_PTR(-EBUSY)); } return (s); } static struct dentry * zpl_mount(struct file_system_type *fs_type, int flags, const char *osname, void *data) { zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); if (IS_ERR(sb)) return (ERR_CAST(sb)); return (dget(sb->s_root)); } static void zpl_kill_sb(struct super_block *sb) { zfs_preumount(sb); kill_anon_super(sb); } void zpl_prune_sb(int64_t nr_to_scan, void *arg) { struct super_block *sb = (struct super_block *)arg; int objects = 0; (void) -zfs_prune(sb, nr_to_scan, &objects); } const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, .statfs = zpl_statfs, .remount_fs = zpl_remount_fs, + .show_devname = zpl_show_devname, .show_options = zpl_show_options, .show_stats = NULL, }; struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; Index: vendor-sys/openzfs/dist/module/zfs/Makefile.in =================================================================== --- vendor-sys/openzfs/dist/module/zfs/Makefile.in (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/Makefile.in (revision 365340) @@ -1,153 +1,154 @@ ifneq ($(KBUILD_EXTMOD),) src = @abs_srcdir@ obj = @abs_builddir@ mfdir = $(obj) else mfdir = $(srctree)/$(src) endif MODULE := zfs obj-$(CONFIG_ZFS) := $(MODULE).o # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value $(MODULE)-objs += abd.o $(MODULE)-objs += aggsum.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o $(MODULE)-objs += bpobj.o $(MODULE)-objs += bptree.o $(MODULE)-objs += btree.o $(MODULE)-objs += bqueue.o $(MODULE)-objs += dataset_kstats.o $(MODULE)-objs += dbuf.o $(MODULE)-objs += dbuf_stats.o $(MODULE)-objs += ddt.o $(MODULE)-objs += ddt_zap.o $(MODULE)-objs += dmu.o $(MODULE)-objs += dmu_diff.o $(MODULE)-objs += dmu_object.o $(MODULE)-objs += dmu_objset.o $(MODULE)-objs += dmu_recv.o $(MODULE)-objs += dmu_redact.o $(MODULE)-objs += dmu_send.o $(MODULE)-objs += dmu_traverse.o $(MODULE)-objs += dmu_tx.o $(MODULE)-objs += dmu_zfetch.o $(MODULE)-objs += dnode.o $(MODULE)-objs += dnode_sync.o $(MODULE)-objs += dsl_bookmark.o $(MODULE)-objs += dsl_crypt.o $(MODULE)-objs += dsl_dataset.o $(MODULE)-objs += dsl_deadlist.o $(MODULE)-objs += dsl_deleg.o $(MODULE)-objs += dsl_destroy.o $(MODULE)-objs += dsl_dir.o $(MODULE)-objs += dsl_pool.o $(MODULE)-objs += dsl_prop.o $(MODULE)-objs += dsl_scan.o $(MODULE)-objs += dsl_synctask.o $(MODULE)-objs += dsl_userhold.o $(MODULE)-objs += edonr_zfs.o $(MODULE)-objs += fm.o $(MODULE)-objs += gzip.o $(MODULE)-objs += hkdf.o $(MODULE)-objs += lz4.o $(MODULE)-objs += lzjb.o $(MODULE)-objs += metaslab.o $(MODULE)-objs += mmp.o $(MODULE)-objs += multilist.o $(MODULE)-objs += objlist.o $(MODULE)-objs += pathname.o $(MODULE)-objs += range_tree.o $(MODULE)-objs += refcount.o $(MODULE)-objs += rrwlock.o $(MODULE)-objs += sa.o $(MODULE)-objs += sha256.o $(MODULE)-objs += skein_zfs.o $(MODULE)-objs += spa.o $(MODULE)-objs += spa_boot.o $(MODULE)-objs += spa_checkpoint.o $(MODULE)-objs += spa_config.o $(MODULE)-objs += spa_errlog.o $(MODULE)-objs += spa_history.o $(MODULE)-objs += spa_log_spacemap.o $(MODULE)-objs += spa_misc.o +$(MODULE)-objs += spa_stats.o $(MODULE)-objs += space_map.o $(MODULE)-objs += space_reftree.o $(MODULE)-objs += txg.o $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o $(MODULE)-objs += vdev_initialize.o $(MODULE)-objs += vdev_label.o $(MODULE)-objs += vdev_mirror.o $(MODULE)-objs += vdev_missing.o $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o $(MODULE)-objs += vdev_rebuild.o $(MODULE)-objs += vdev_removal.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += vdev_trim.o $(MODULE)-objs += zap.o $(MODULE)-objs += zap_leaf.o $(MODULE)-objs += zap_micro.o $(MODULE)-objs += zcp.o $(MODULE)-objs += zcp_get.o $(MODULE)-objs += zcp_global.o $(MODULE)-objs += zcp_iter.o $(MODULE)-objs += zcp_set.o $(MODULE)-objs += zcp_synctask.o $(MODULE)-objs += zfeature.o $(MODULE)-objs += zfs_byteswap.o $(MODULE)-objs += zfs_fm.o $(MODULE)-objs += zfs_fuid.o $(MODULE)-objs += zfs_ioctl.o $(MODULE)-objs += zfs_log.o $(MODULE)-objs += zfs_onexit.o $(MODULE)-objs += zfs_quota.o $(MODULE)-objs += zfs_ratelimit.o $(MODULE)-objs += zfs_replay.o $(MODULE)-objs += zfs_rlock.o $(MODULE)-objs += zfs_sa.o $(MODULE)-objs += zil.o $(MODULE)-objs += zio.o $(MODULE)-objs += zio_checksum.o $(MODULE)-objs += zio_compress.o $(MODULE)-objs += zio_inject.o $(MODULE)-objs += zle.o $(MODULE)-objs += zrlock.o $(MODULE)-objs += zthr.o $(MODULE)-objs += zvol.o # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o $(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o $(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o ifeq ($(CONFIG_ALTIVEC),y) $(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec endif include $(mfdir)/../os/linux/zfs/Makefile Index: vendor-sys/openzfs/dist/module/zfs/arc.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/arc.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/arc.c (revision 365340) @@ -1,10582 +1,10583 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2019, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * arc_meta_limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed and the arc_meta_limit honored. For example, * when using the ZPL each dentry holds a references on a znode. These * dentries must be pruned before the arc buffer holding the znode can * be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ int zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ int arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ int arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ int arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_ms; static int arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. */ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_min = 0; unsigned long zfs_arc_dnode_limit = 0; unsigned long zfs_arc_dnode_reduce_percent = 10; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle. */ unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * ARC will evict meta buffers that exceed arc_meta_limit. This * tunable make arc_meta_limit adjustable for different workloads. */ unsigned long zfs_arc_meta_limit_percent = 75; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ unsigned long zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux specific */ unsigned long zfs_arc_sys_free = 0; int zfs_arc_min_prefetch_ms = 0; int zfs_arc_min_prescient_prefetch_ms = 0; int zfs_arc_p_dampener_disable = 1; int zfs_arc_meta_prune = 10000; int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; int zfs_arc_meta_adjust_restarts = 4096; int zfs_arc_lotsfree_percent = 10; /* The 6 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ _NOTE(CONSTCOND) \ } while (0) kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; arc_state_t *arc_mru; arc_state_t *arc_mfu; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ /* max size for dnodes */ #define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ /* size of all b_rabd's in entire arc */ #define arc_raw_size ARCSTAT(arcstat_raw_size) /* compressed size of entire arc */ #define arc_compressed_size ARCSTAT(arcstat_compressed_size) /* uncompressed size of entire arc */ #define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) /* number of bytes in the arc from arc_buf_t's */ #define arc_overhead_size ARCSTAT(arcstat_overhead_size) /* * There are also some ARC variables that we want to export, but that are * updated so often that having the canonical representation be the statistic * variable causes a performance bottleneck. We want to use aggsum_t's for these * instead, but still be able to export the kstat in the same way as before. * The solution is to always use the aggsum version, except in the kstat update * callback. */ aggsum_t arc_size; aggsum_t arc_meta_used; aggsum_t astat_data_size; aggsum_t astat_metadata_size; aggsum_t astat_dbuf_size; aggsum_t astat_dnode_size; aggsum_t astat_bonus_size; aggsum_t astat_hdr_size; aggsum_t astat_l2_hdr_size; aggsum_t astat_abd_chunk_waste_size; hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define HT_LOCK_ALIGN 64 #define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[HT_LOCK_PAD]; #endif }; #define BUF_LOCKS 8192 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS]; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* L2ARC Performance Tunables */ unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ int l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_DO_ADAPT = 0x2, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ unsigned long l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ int l2arc_rebuild_enabled = B_TRUE; unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; hdr_full_cons(vbuf, unused, kmflag); bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_full_crypt_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; hdr_full_dest(vbuf, unused); arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr __maybe_unused = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch) { arc_buf_hdr_t *hdr; ASSERT(size != 0); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { tmpbuf = zio_buf_alloc(lsize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); ASSERT3U(csize, <=, psize); abd_zero_off(abd, csize, psize - csize); } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret != ENOENT) goto error; if (tmpbuf != NULL) abd_free(abd); return (0); error: if (tmpbuf != NULL) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !ARC_BUF_SHARED(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf, hash_lock); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { arc_state_t *state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } state = hdr->b_l1hdr.b_state; if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { multilist_remove(state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_new = B_TRUE; } arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) zfs_refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) hdr->b_l1hdr.b_state = new_state; /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: aggsum_add(&astat_data_size, space); break; case ARC_SPACE_META: aggsum_add(&astat_metadata_size, space); break; case ARC_SPACE_BONUS: aggsum_add(&astat_bonus_size, space); break; case ARC_SPACE_DNODE: aggsum_add(&astat_dnode_size, space); break; case ARC_SPACE_DBUF: aggsum_add(&astat_dbuf_size, space); break; case ARC_SPACE_HDRS: aggsum_add(&astat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&astat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. */ aggsum_add(&astat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) aggsum_add(&arc_meta_used, space); aggsum_add(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: aggsum_add(&astat_data_size, -space); break; case ARC_SPACE_META: aggsum_add(&astat_metadata_size, -space); break; case ARC_SPACE_BONUS: aggsum_add(&astat_bonus_size, -space); break; case ARC_SPACE_DNODE: aggsum_add(&astat_dnode_size, -space); break; case ARC_SPACE_DBUF: aggsum_add(&astat_dbuf_size, -space); break; case ARC_SPACE_HDRS: aggsum_add(&astat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&astat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: aggsum_add(&astat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); /* * We use the upper bound here rather than the precise value * because the arc_meta_max value doesn't need to be * precise. It's only consumed by humans via arcstats. */ if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) arc_meta_max = aggsum_upper_bound(&arc_meta_used); aggsum_add(&arc_meta_used, -space); } ASSERT(aggsum_compare(&arc_size, space) >= 0); aggsum_add(&arc_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; if (encrypted) hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_put(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) { hdr->b_crypt_hdr.b_ebufcnt -= 1; /* * If we have no more encrypted buffers and we've * already gotten a copy of the decrypted data we can * free b_rabd to save some space. */ if (hdr->b_crypt_hdr.b_ebufcnt == 0 && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_hdr_free_abd(hdr, B_TRUE); } } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, do_adapt); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, do_adapt); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type, boolean_t alloc_rdata) { arc_buf_hdr_t *hdr; int flags = ARC_HDR_DO_ADAPT; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0; ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; /* * Allocate the hdr's buffer. This will contain either * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ arc_hdr_alloc_abd(hdr, flags); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); /* * if the caller wanted a new full header and the header is to be * encrypted we will actually allocate the header from the full crypt * cache instead. The same applies to freeing from the old cache. */ if (HDR_PROTECTED(hdr) && new == hdr_full_cache) new = hdr_full_crypt_cache; if (HDR_PROTECTED(hdr) && old == hdr_full_cache) old = hdr_full_crypt_cache; nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache || new == hdr_full_crypt_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function allows an L1 header to be reallocated as a crypt * header and vice versa. If we are going to a crypt header, the * new fields will be zeroed out. */ static arc_buf_hdr_t * arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) { arc_buf_hdr_t *nhdr; arc_buf_t *buf; kmem_cache_t *ncache, *ocache; unsigned nsize, osize; /* * This function requires that hdr is in the arc_anon state. * Therefore it won't have any L2ARC data for us to worry * about copying. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); ASSERT3P(hdr->b_hash_next, ==, NULL); if (need_crypt) { ncache = hdr_full_crypt_cache; nsize = sizeof (hdr->b_crypt_hdr); ocache = hdr_full_cache; osize = HDR_FULL_SIZE; } else { ncache = hdr_full_cache; nsize = HDR_FULL_SIZE; ocache = hdr_full_crypt_cache; osize = sizeof (hdr->b_crypt_hdr); } nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); /* * Copy all members that aren't locks or condvars to the new header. * No lists are pointing to us (as we asserted above), so we don't * need to worry about the list nodes. */ nhdr->b_dva = hdr->b_dva; nhdr->b_birth = hdr->b_birth; nhdr->b_type = hdr->b_type; nhdr->b_flags = hdr->b_flags; nhdr->b_psize = hdr->b_psize; nhdr->b_lsize = hdr->b_lsize; nhdr->b_spa = hdr->b_spa; nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits; nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; /* * This zfs_refcount_add() exists only to ensure that the individual * arc buffers always point to a header that is referenced, avoiding * a small race condition that could trigger ASSERTs. */ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { mutex_enter(&buf->b_evict_lock); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); } zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); if (need_crypt) { arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); } else { arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); } /* unset all members of the original hdr */ bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_type = ARC_BUFC_INVALID; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; hdr->b_spa = 0; hdr->b_l1hdr.b_freeze_cksum = NULL; hdr->b_l1hdr.b_buf = NULL; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_byteswap = 0; hdr->b_l1hdr.b_state = NULL; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; hdr->b_l1hdr.b_acb = NULL; hdr->b_l1hdr.b_pabd = NULL; if (ocache == hdr_full_crypt_cache) { ASSERT(!HDR_HAS_RABD(hdr)); hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; hdr->b_crypt_hdr.b_ebufcnt = 0; hdr->b_crypt_hdr.b_dsobj = 0; bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); } buf_discard_identity(hdr); kmem_cache_free(ocache, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); if (!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); if (iv != NULL) bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); if (mac != NULL) bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); if (!arc_buf_is_shared(buf)) { /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to * disk, it's easiest if we just set up sharing between the * buf and the hdr. */ arc_hdr_free_abd(hdr, B_FALSE); arc_share_buf(hdr, buf); } return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); return (buf); } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_psize, -psize); ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (!HDR_PROTECTED(hdr)) { kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_full_crypt_cache, hdr); } } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, NULL, tag)); arc_hdr_destroy(hdr); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime))) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } if (hdr->b_l1hdr.b_bufcnt == 0) { arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, int64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; int evict_count = 0; ASSERT3P(marker, !=, NULL); IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; hdr = multilist_sublist_prev(mls, marker)) { if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || (evict_count >= zfs_arc_evict_batch_limit)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t evicted = arc_evict_hdr(hdr, hash_lock); mutex_exit(hash_lock); bytes_evicted += evicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count++; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += bytes_evicted; if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ cond_resched(); return (bytes_evicted); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Try to reduce pinned dnodes with a floor of arc_dnode_limit. * Request that 10% of the LRUs be scanned by the superblock * shrinker. */ if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size, arc_dnode_size_limit) > 0) { arc_prune_async((aggsum_upper_bound(&astat_dnode_size) - arc_dnode_size_limit) / sizeof (dnode_t) / zfs_arc_dnode_reduce_percent); } /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (bytes == ARC_EVICT_ALL) bytes_remaining = ARC_EVICT_ALL; else if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); kmem_cache_free(hdr_full_cache, markers[i]); } kmem_free(markers, sizeof (*markers) * num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function * prevents us from trying to evict more from a state's list than * is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { int64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } return (0); } /* * The goal of this function is to evict enough meta data buffers from the * ARC in order to enforce the arc_meta_limit. Achieving this is slightly * more complicated than it appears because it is common for data buffers * to have holds on meta data buffers. In addition, dnode meta data buffers * will be held by the dnodes in the block preventing them from being freed. * This means we can't simply traverse the ARC and expect to always find * enough unheld meta data buffer to release. * * Therefore, this function has been updated to make alternating passes * over the ARC releasing data buffers and then newly unheld meta data * buffers. This ensures forward progress is maintained and meta_used * will decrease. Normally this is sufficient, but if required the ARC * will call the registered prune callbacks causing dentry and inodes to * be dropped from the VFS cache. This will make dnode meta data buffers * available for reclaim. */ static uint64_t arc_evict_meta_balanced(uint64_t meta_used) { int64_t delta, prune = 0, adjustmnt; uint64_t total_evicted = 0; arc_buf_contents_t type = ARC_BUFC_DATA; int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); restart: /* * This slightly differs than the way we evict from the mru in * arc_evict because we don't have a "target" value (i.e. no * "meta" arc_p). As a result, I think we can completely * cannibalize the metadata in the MRU before we evict the * metadata from the MFU. I think we probably need to implement a * "metadata arc_p" value to do this properly. */ adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } /* * We can't afford to recalculate adjustmnt here. If we do, * new metadata buffers can sneak into the MRU or ANON lists, * thus penalize the MFU metadata. Although the fudge factor is * small, it has been empirically shown to be significant for * certain workloads (e.g. creating many empty directories). As * such, we use the original calculation for adjustmnt, and * simply decrement the amount of data evicted from the MRU. */ if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); } adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); } /* * If after attempting to make the requested adjustment to the ARC * the meta limit is still being exceeded then request that the * higher layers drop some cached objects which have holds on ARC * meta buffers. Requests to the upper layers will be made with * increasingly large scan sizes until the ARC is below the limit. */ if (meta_used > arc_meta_limit) { if (type == ARC_BUFC_DATA) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; if (zfs_arc_meta_prune) { prune += zfs_arc_meta_prune; arc_prune_async(prune); } } if (restarts > 0) { restarts--; goto restart; } } return (total_evicted); } /* * Evict metadata buffers from the cache, such that arc_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_evict_meta_only(uint64_t meta_used) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } static uint64_t arc_evict_meta(uint64_t meta_used) { if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) return (arc_evict_meta_only(meta_used)); else return (arc_evict_meta_balanced(meta_used)); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_evict_type(arc_state_t *state) { multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). */ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arc_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; uint64_t asize = aggsum_value(&arc_size); uint64_t ameta = aggsum_value(&arc_meta_used); /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_evict_meta(ameta); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(asize - arc_c), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Re-sum ARC stats after the first round of evictions. */ asize = aggsum_value(&arc_size); ameta = aggsum_value(&arc_meta_used); /* * Adjust MFU size * * Now that we've tried to evict enough from the MRU to get its * size back to arc_p, if we're still above the target cache * size, we evict the rest from the MFU. */ target = asize - arc_c; if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists * * In addition to the above, the ARC also defines target values * for the ghost lists. The sum of the mru list and mru ghost * list should never exceed the target size of the cache, and * the sum of the mru list, mfu list, mru ghost list, and mfu * ghost list should never exceed twice the target size of the * cache. The following logic enforces these limits on the ghost * caches, and evicts from them as needed. */ target = zfs_refcount_count(&arc_mru->arcs_size) + zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than * or equal to arc_c (we enforced this above), which means we * can use the simpler of the two equations below: * * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c * mru ghost + mfu ghost <= arc_c */ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { uint64_t asize = aggsum_value(&arc_size); /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t c = MIN(arc_c, asize); if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } else { arc_c = arc_c_min; } if (asize > arc_c) { /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; #ifdef _KERNEL if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) && zfs_arc_meta_prune) { /* * We are exceeding our meta-data cache limit. * Prune some entries to release holds on meta-data. */ arc_prune_async(zfs_arc_meta_prune); } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } /* ARGSUSED */ static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { /* * This is necessary so that any changes which may have been made to * many of the zfs_arc_* module parameters will be propagated to * their actual internal variable counterparts. Without this, * changing those module params at runtime would have no effect. */ arc_tuning_update(B_FALSE); /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this change, the data might be up to 1 second * out of date(the arc_evict_zthr has a maximum sleep * time of 1 second); but that should suffice. The * arc_state_t structures can be queried directly if more * accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ return (arc_evict_needed); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ /* ARGSUSED */ static void arc_evict_cb(void *arg, zthr_t *zthr) { uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Evict from cache */ evicted = arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } /* ARGSUSED */ static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { int64_t free_memory = arc_available_memory(); /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ /* ARGSUSED */ static void arc_reap_cb(void *arg, zthr_t *zthr) { int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less then the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); if (aggsum_upper_bound(&arc_size) >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ boolean_t arc_is_overflowing(void) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, do_adapt); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { ASSERT(type == ARC_BUFC_DATA); return (abd_alloc(size, B_FALSE)); } } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, B_TRUE); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. Waiting for * eviction ensures that the memory used by the ARC decreases. Waiting for * free memory ensures that the system won't run out of free pages, regardless * of ARC behavior and settings. See arc_lowmem_init(). */ void arc_wait_for_eviction(uint64_t amount) { mutex_enter(&arc_evict_lock); if (arc_is_overflowing()) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); if (amount != 0) { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); if (last != NULL) { ASSERT3U(last->aew_count, >, arc_evict_count); aw.aew_count = last->aew_count + amount; } else { aw.aew_count = arc_evict_count + amount; } list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count * reaches aew_count, or when the ARC is no longer * overflowing and eviction completes. */ cv_wait(&aw.aew_cv, &arc_evict_lock); /* * In case of "false" wakeup, we will still be on the * list. */ if (list_link_active(&aw.aew_node)) list_remove(&arc_evict_waiters, &aw); cv_destroy(&aw.aew_cv); } } mutex_exit(&arc_evict_lock); } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); if (do_adapt) arc_adapt(size, state); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. * * We do the overflowing check without holding the arc_evict_lock to * reduce lock contention in this hot path. Note that * arc_wait_for_eviction() will acquire the lock and check again to * ensure we are truly overflowing before blocking. */ if (arc_is_overflowing()) { arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100); } VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (aggsum_upper_bound(&arc_size) < arc_c && hdr->b_l1hdr.b_state == arc_anon && (zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { mutex_enter(&buf->b_evict_lock); arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(&buf->b_evict_lock); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { if (buf == NULL) return; bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ /* ARGSUSED */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { if (!acb->acb_done) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + (void) zfs_ereport_post( + FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto out; } ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); goto out; } mutex_exit(hash_lock); goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to * wait for i/o because we did a predictive * prefetch i/o for it, which has completed. */ DTRACE_PROBE1( arc__demand__hit__predictive__prefetch, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { ARCSTAT_BUMP( arcstat_demand_hit_prescient_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); } ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb); - zfs_ereport_post( + (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0, 0); } } if (rc != 0) { (void) remove_reference(hdr, hash_lock, private); arc_buf_destroy_impl(buf); buf = NULL; } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); goto out; } /* * Gracefully handle a damaged logical block size as a * checksum error. */ if (lsize > spa_maxblocksize(spa)) { rc = SET_ERROR(ECKSUM); if (hash_lock != NULL) mutex_exit(hash_lock); goto out; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type, encrypted_read); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } /* * This is a delicate dance that we play here. * This hdr might be in the ghost list so we access * it to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); arc_hdr_alloc_abd(hdr, alloc_flags); } if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. */ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); } if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); atomic_inc_32(&hdr->b_l2hdr.b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the blkptr * itself via decode_embedded_bp_compressed(). */ if (l2arc_ndev != 0 && !embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has a reference (i.e. a dedup-ed, * dmu_sync-ed block). If this block is being prefetched, then it * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr * until the I/O completes. A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT(HDR_EMPTY(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size, arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) hdr->b_crypt_hdr.b_ebufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr)); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; if (ARC_BUF_ENCRYPTED(buf)) nhdr->b_crypt_hdr.b_ebufcnt = 1; nhdr->b_l1hdr.b_mru_hits = 0; nhdr->b_l1hdr.b_mru_ghost_hits = 0; nhdr->b_l1hdr.b_mfu_hits = 0; nhdr->b_l1hdr.b_mfu_ghost_hits = 0; nhdr->b_l1hdr.b_l2_hits = 0; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) zfs_refcount_add_many(&arc_anon->arcs_size, arc_buf_size(buf), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, ZIO_DATA_SALT_LEN); bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, ZIO_DATA_IV_LEN); bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve >> 10, meta_esize >> 10, data_esize >> 10, reserve >> 10, arc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = zfs_refcount_count(&state->arcs_size); evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } else { arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); ARCSTAT(arcstat_size) = aggsum_value(&arc_size); ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); ARCSTAT(arcstat_metadata_size) = aggsum_value(&astat_metadata_size); ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); #if defined(COMPAT_FREEBSD11) ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) + aggsum_value(&astat_dnode_size) + aggsum_value(&astat_dbuf_size); #endif ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); ARCSTAT(arcstat_abd_chunk_waste_size) = aggsum_value(&astat_abd_chunk_waste_size); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); } return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); unsigned long limit; /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); arc_p = (arc_c >> 1); if (arc_meta_limit > arc_c_max) arc_meta_limit = arc_c_max; if (arc_dnode_size_limit > arc_meta_limit) arc_dnode_size_limit = arc_meta_limit; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 16M - */ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_meta_min <= arc_c_max)) { arc_meta_min = zfs_arc_meta_min; if (arc_meta_limit < arc_meta_min) arc_meta_limit = arc_meta_min; if (arc_dnode_size_limit < arc_meta_min) arc_dnode_size_limit = arc_meta_min; } WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); /* Valid range: - */ limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; if ((limit != arc_meta_limit) && (limit >= arc_meta_min) && (limit <= arc_c_max)) arc_meta_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); /* Valid range: - */ limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; if ((limit != arc_dnode_size_limit) && (limit >= arc_meta_min) && (limit <= arc_meta_limit)) arc_dnode_size_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N */ if (zfs_arc_p_min_shift) arc_p_min_shift = zfs_arc_p_min_shift; /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if ((zfs_arc_lotsfree_percent >= 0) && (zfs_arc_lotsfree_percent <= 100)) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_init(void) { arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; arc_mru->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_l2c_only->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size); zfs_refcount_create(&arc_mru->arcs_size); zfs_refcount_create(&arc_mru_ghost->arcs_size); zfs_refcount_create(&arc_mfu->arcs_size); zfs_refcount_create(&arc_mfu_ghost->arcs_size); zfs_refcount_create(&arc_l2c_only->arcs_size); aggsum_init(&arc_meta_used, 0); aggsum_init(&arc_size, 0); aggsum_init(&astat_data_size, 0); aggsum_init(&astat_metadata_size, 0); aggsum_init(&astat_hdr_size, 0); aggsum_init(&astat_l2_hdr_size, 0); aggsum_init(&astat_bonus_size, 0); aggsum_init(&astat_dnode_size, 0); aggsum_init(&astat_dbuf_size, 0); aggsum_init(&astat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size); zfs_refcount_destroy(&arc_mru->arcs_size); zfs_refcount_destroy(&arc_mru_ghost->arcs_size); zfs_refcount_destroy(&arc_mfu->arcs_size); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); zfs_refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); aggsum_fini(&arc_meta_used); aggsum_fini(&arc_size); aggsum_fini(&astat_data_size); aggsum_fini(&astat_metadata_size); aggsum_fini(&astat_hdr_size); aggsum_fini(&astat_l2_hdr_size); aggsum_fini(&astat_bonus_size); aggsum_fini(&astat_dnode_size); aggsum_fini(&astat_dbuf_size); aggsum_fini(&astat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); #ifndef _KERNEL /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; arc_p = (arc_c >> 1); /* Set min to 1/2 of arc_c_min */ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* Initialize maximum observed usage to zero */ arc_meta_max = 0; /* * Set arc_meta_limit to a percent of arc_c_max with a floor of * arc_meta_min, and a ceiling of arc_c_max. */ percent = MIN(zfs_arc_meta_limit_percent, 100); arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_size_limit = (percent * arc_meta_limit) / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_evict_zthr = zthr_create_timer("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1)); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1)); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_head(&arc_prune_list)) != NULL) { list_remove(&arc_prune_list, p); zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. * * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; tsize = size + l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) tsize += MAX(64 * 1024 * 1024, (tsize * l2arc_trim_ahead) / 100); if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", l2arc_log_blk_overhead(size, dev), dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (arc_warm == B_FALSE) size += l2arc_write_boost; } return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); ARCSTAT_INCR(arcstat_l2_psize, -psize); ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { bzero(l2dhdr, dev->l2ad_dev_hdr_asize); } else { bzero(&l2dhdr->dh_start_lbps[i], sizeof (l2arc_log_blkptr_t)); } break; } bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). */ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; buflist = &dev->l2ad_buflist; /* * We need to add in the worst case scenario of log block overhead. */ distance += l2arc_log_blk_overhead(distance, dev); if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the write size, whichever is greater. */ distance += MAX(64 * 1024 * 1024, (distance * l2arc_trim_ahead) / 100); } top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { ASSERT3U(asize, >=, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { cabd = abd_alloc_for_io(asize, ismd); tmp = abd_borrow_buf(cabd, asize); psize = zio_compress_data(compress, to_write, tmp, size, hdr->b_complevel); if (psize >= size) { abd_return_buf(cabd, tmp, asize); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) bzero((char *)tmp + psize, asize - psize); psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, asize); to_write = cabd; } encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_put(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. */ for (int try = 0; try < L2ARC_FEED_TYPES; try++) { multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); if ((write_asize + asize) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3U(arc_hdr_size(hdr), >, 0); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); continue; } l2arc_free_abd_on_write(to_write, asize, type); } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; /* * Create a list to save allocated abd buffers * for l2arc_log_blk_commit(). */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); /* * Append buf info to current log and commit if full. * arcstat_l2_{size,asize} kstats are updated * internally. */ if (l2arc_log_blk_insert(dev, hdr)) l2arc_log_blk_commit(dev, pio, cb); zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); ARCSTAT_INCR(arcstat_l2_psize, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&astat_l2_hdr_size); return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ /* ARGSUSED */ static void l2arc_feed_thread(void *unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_sig(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); vdev_ashift_optimize(vd); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Decide if vdev is eligible for L2ARC rebuild */ l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE); } void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; l2arc_dev_hdr_phys_t *l2dhdr; uint64_t l2dhdr_asize; spa_t *spa; int err; boolean_t l2dhdr_valid = B_TRUE; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); spa = dev->l2ad_spa; l2dhdr = dev->l2ad_dev_hdr; l2dhdr_asize = dev->l2ad_dev_hdr_asize; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if ((err = l2arc_dev_hdr_read(dev)) != 0) l2dhdr_valid = B_FALSE; if (l2dhdr_valid && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { bzero(l2dhdr, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Cancel any ongoing or scheduled rebuild. */ mutex_enter(&l2arc_rebuild_thr_lock); if (remdev->l2ad_rebuild_began == B_TRUE) { remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); /* * Remove device from global list */ mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); bcopy(&lbps[0], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); bzero(l2dhdr, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. */ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_put(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid, dev->l2ad_hand, dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } if (abd != NULL) abd_free(abd); return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize; /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls the former. */ vdev_space_update(dev->l2ad_vdev, asize, 0, 0); ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists)); ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists)); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_put(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; uint8_t *tmpbuf; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; _NOTE(CONSTCOND) L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ bzero(tmpbuf + psize, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ bcopy(lb, tmpbuf, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_put(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; bzero(le, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_long, param_get_long, ZMOD_RW, "Min arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_long, param_get_long, ZMOD_RW, "Max arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, param_get_long, ZMOD_RW, "Metadata limit for arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, "Percent of arc size for arc meta limit"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, param_get_long, ZMOD_RW, "Min arc metadata"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, "Meta objects to scan for prune"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW, "Limit number of restarts in arc_evict_meta"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, "Meta reclaim strategy"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_int, ZMOD_RW, "Seconds before growing arc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, "Disable arc_p adapt dampener"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim arc to"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed arc buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, param_get_long, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/dsl_dir.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/dsl_dir.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/dsl_dir.c (revision 365340) @@ -1,2403 +1,2403 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" -#ifdef _KERNEL -#include -#endif /* * Filesystem and Snapshot Limits * ------------------------------ * * These limits are used to restrict the number of filesystems and/or snapshots * that can be created at a given level in the tree or below. A typical * use-case is with a delegated dataset where the administrator wants to ensure * that a user within the zone is not creating too many additional filesystems * or snapshots, even though they're not exceeding their space quota. * * The filesystem and snapshot counts are stored as extensible properties. This * capability is controlled by a feature flag and must be enabled to be used. * Once enabled, the feature is not active until the first limit is set. At * that point, future operations to create/destroy filesystems or snapshots * will validate and update the counts. * * Because the count properties will not exist before the feature is active, * the counts are updated when a limit is first set on an uninitialized * dsl_dir node in the tree (The filesystem/snapshot count on a node includes * all of the nested filesystems/snapshots. Thus, a new leaf node has a * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and * snapshot count properties on a node indicate uninitialized counts on that * node.) When first setting a limit on an uninitialized node, the code starts * at the filesystem with the new limit and descends into all sub-filesystems * to add the count properties. * * In practice this is lightweight since a limit is typically set when the * filesystem is created and thus has no children. Once valid, changing the * limit value won't require a re-traversal since the counts are already valid. * When recursively fixing the counts, if a node with a limit is encountered * during the descent, the counts are known to be valid and there is no need to * descend into that filesystem's children. The counts on filesystems above the * one with the new limit will still be uninitialized, unless a limit is * eventually set on one of those filesystems. The counts are always recursively * updated when a limit is set on a dataset, unless there is already a limit. * When a new limit value is set on a filesystem with an existing limit, it is * possible for the new limit to be less than the current count at that level * since a user who can change the limit is also allowed to exceed the limit. * * Once the feature is active, then whenever a filesystem or snapshot is * created, the code recurses up the tree, validating the new count against the * limit at each initialized level. In practice, most levels will not have a * limit set. If there is a limit at any initialized level up the tree, the * check must pass or the creation will fail. Likewise, when a filesystem or * snapshot is destroyed, the counts are recursively adjusted all the way up * the initialized nodes in the tree. Renaming a filesystem into different point * in the tree will first validate, then update the counts on each branch up to * the common ancestor. A receive will also validate the counts and then update * them. * * An exception to the above behavior is that the limit is not enforced if the * user has permission to modify the limit. This is primarily so that * recursive snapshots in the global zone always work. We want to prevent a * denial-of-service in which a lower level delegated dataset could max out its * limit and thus block recursive snapshots from being taken in the global zone. * Because of this, it is possible for the snapshot count to be over the limit * and snapshots taken in the global zone could cause a lower level dataset to * hit or exceed its limit. The administrator taking the global zone recursive * snapshot should be aware of this side-effect and behave accordingly. * For consistency, the filesystem limit is also not enforced if the user can * modify the limit. * * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by * dsl_dir_init_fs_ss_count(). */ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); typedef struct ddulrt_arg { dsl_dir_t *ddulrta_dd; uint64_t ddlrta_txg; } ddulrt_arg_t; static void dsl_dir_evict_async(void *dbu) { dsl_dir_t *dd = dbu; int t; dsl_pool_t *dp __maybe_unused = dd->dd_pool; dd->dd_dbuf = NULL; for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) dsl_dir_async_rele(dd->dd_parent, dd); spa_async_close(dd->dd_pool->dp_spa, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; dmu_object_info_t doi; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); dsl_prop_init(dd); if (dsl_dir_is_zapified(dd)) { err = zap_lookup(dp->dp_meta_objset, ddobj, DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj); if (err == 0) { /* check for on-disk format errata */ if (dsl_dir_incompatible_encryption_version( dd)) { dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; } } else if (err != ENOENT) { goto errout; } } dsl_dir_snap_cmtime_update(dd); if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strlcpy(dd->dd_myname, tail, sizeof (dd->dd_myname)); } else { err = zap_value_search(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err != 0) goto errout; } else { (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa), sizeof (dd->dd_myname)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); if (dsl_dir_is_zapified(dd)) { uint64_t obj; err = zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); if (err == 0) dsl_dir_livelist_open(dd, obj); else if (err != ENOENT) goto errout; } } dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. */ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* * Remove a reference to the given dsl dir that is being asynchronously * released. Async releases occur from a taskq performing eviction of * dsl datasets and dirs. This process is identical to a normal release * with the exception of using the async API for releasing the reference on * the spa. */ void dsl_dir_async_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_async_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&dd->dd_lock); } else { VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN); p = NULL; } else if (p[0] == '/') { if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; } else { panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. The name must be in the specified dsl_pool_t. This * thread must hold the dp_config_rwlock for the pool. Returns NULL if the * path is bogus, or if tail==NULL and we couldn't parse the whole name. * (*tail)[0] == '@' means that the last component is a snapshot. */ int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; uint64_t ddobj; buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); err = getcomponent(name, buf, &next); if (err != 0) goto error; /* Make sure the name is in the specified pool. */ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) { err = SET_ERROR(EXDEV); goto error; } ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err != 0) { goto error; } while (next != NULL) { dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) err = 0; break; } err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); if (err != 0) break; dsl_dir_rele(dd, tag); dd = child_dd; next = nextnext; } if (err != 0) { dsl_dir_rele(dd, tag); goto error; } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = SET_ERROR(ENOENT); } if (tailp != NULL) *tailp = next; if (err == 0) *ddp = dd; error: kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN); return (err); } /* * If the counts are already initialized for this filesystem and its * descendants then do nothing, otherwise initialize the counts. * * The counts on this filesystem, and those below, may be uninitialized due to * either the use of a pre-existing pool which did not support the * filesystem/snapshot limit feature, or one in which the feature had not yet * been enabled. * * Recursively descend the filesystem tree and update the filesystem/snapshot * counts on each filesystem below, then update the cumulative count on the * current filesystem. If the filesystem already has a count set on it, * then we know that its counts, and the counts on the filesystems below it, * are already correct, so we don't have to update this filesystem. */ static void dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) { uint64_t my_fs_cnt = 0; uint64_t my_ss_cnt = 0; dsl_pool_t *dp = dd->dd_pool; objset_t *os = dp->dp_meta_objset; zap_cursor_t *zc; zap_attribute_t *za; dsl_dataset_t *ds; ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); ASSERT(dsl_pool_config_held(dp)); ASSERT(dmu_tx_is_syncing(tx)); dsl_dir_zapify(dd, tx); /* * If the filesystem count has already been initialized then we * don't need to recurse down any further. */ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) return; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* Iterate my child dirs */ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dir_t *chld_dd; uint64_t count; VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, &chld_dd)); /* * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets. */ if (chld_dd->dd_myname[0] == '$') { dsl_dir_rele(chld_dd, FTAG); continue; } my_fs_cnt++; /* count this child */ dsl_dir_init_fs_ss_count(chld_dd, tx); VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); my_fs_cnt += count; VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); my_ss_cnt += count; dsl_dir_rele(chld_dd, FTAG); } zap_cursor_fini(zc); /* Count my snapshots (we counted children's snapshots above) */ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { /* Don't count temporary snapshots */ if (za->za_name[0] != '%') my_ss_cnt++; } zap_cursor_fini(zc); dsl_dataset_rele(ds, FTAG); kmem_free(zc, sizeof (zap_cursor_t)); kmem_free(za, sizeof (zap_attribute_t)); /* we're in a sync task, update counts */ dmu_buf_will_dirty(dd->dd_dbuf, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); } static int dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; int error; error = dsl_dataset_hold(dp, ddname, FTAG, &ds); if (error != 0) return (error); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dd = ds->ds_dir; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && dsl_dir_is_zapified(dd) && zap_contains(dp->dp_meta_objset, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EALREADY)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; spa_t *spa; VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); spa = dsl_dataset_get_spa(ds); if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Since the feature was not active and we're now setting a * limit, increment the feature-active counter so that the * feature becomes active for the first time. * * We are already in a sync task so we can update the MOS. */ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); } /* * Since we are now setting a non-UINT64_MAX limit on the filesystem, * we need to ensure the counts are correct. Descend down the tree from * this point and update all of the counts to be accurate. */ dsl_dir_init_fs_ss_count(ds->ds_dir, tx); dsl_dataset_rele(ds, FTAG); } /* * Make sure the feature is enabled and activate it if necessary. * Since we're setting a limit, ensure the on-disk counts are valid. * This is only called by the ioctl path when setting a limit value. * * We do not need to validate the new limit, since users who can change the * limit are also allowed to exceed the limit. */ int dsl_dir_activate_fs_ss_limit(const char *ddname) { int error; error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, ZFS_SPACE_CHECK_RESERVED); if (error == EALREADY) error = 0; return (error); } /* * Used to determine if the filesystem_limit or snapshot_limit should be * enforced. We allow the limit to be exceeded if the user has permission to * write the property value. We pass in the creds that we got in the open * context since we will always be the GZ root in syncing context. We also have * to handle the case where we are allowed to change the limit on the current * dataset, but there may be another limit in the tree above. * * We can never modify these two properties within a non-global zone. In * addition, the other checks are modeled on zfs_secpolicy_write_perms. We * can't use that function since we are already holding the dp_config_rwlock. * In addition, we already have the dd and dealing with snapshots is simplified * in this code. */ typedef enum { ENFORCE_ALWAYS, ENFORCE_NEVER, ENFORCE_ABOVE } enforce_res_t; static enforce_res_t dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr, proc_t *proc) { enforce_res_t enforce = ENFORCE_ALWAYS; uint64_t obj; dsl_dataset_t *ds; uint64_t zoned; const char *zonedstr; ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); #ifdef _KERNEL if (crgetzoneid(cr) != GLOBAL_ZONEID) return (ENFORCE_ALWAYS); /* * We are checking the saved credentials of the user process, which is * not the current process. Note that we can't use secpolicy_zfs(), * because it only works if the cred is that of the current process (on * Linux). */ if (secpolicy_zfs_proc(cr, proc) == 0) return (ENFORCE_NEVER); #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) return (ENFORCE_ALWAYS); ASSERT(dsl_pool_config_held(dd->dd_pool)); if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) return (ENFORCE_ALWAYS); zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED); if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) { /* Only root can access zoned fs's from the GZ */ enforce = ENFORCE_ALWAYS; } else { if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) enforce = ENFORCE_ABOVE; } dsl_dataset_rele(ds, FTAG); return (enforce); } /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. * The prop argument indicates which limit to check. * * Note that all filesystem limits up to the root (or the highest * initialized) filesystem or the given ancestor must be satisfied. */ int dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, dsl_dir_t *ancestor, cred_t *cr, proc_t *proc) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; char *count_prop; enforce_res_t enforce; int err = 0; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative * user in the global zone (i.e. a recursive snapshot by root). * However, we must handle the case of delegated permissions where we * are allowed to change the limit on the current dataset, but there * is another limit in the tree above. */ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc); if (enforce == ENFORCE_NEVER) return (0); /* * e.g. if renaming a dataset with no snapshots, count adjustment * is 0. */ if (delta == 0) return (0); if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { /* * We don't enforce the limit for temporary snapshots. This is * indicated by a NULL cred_t argument. */ if (cr == NULL) return (0); count_prop = DD_FIELD_SNAPSHOT_COUNT; } else { count_prop = DD_FIELD_FILESYSTEM_COUNT; } /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount * the check once we recurse up to the common ancestor. */ if (ancestor == dd) return (0); /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ if (!dsl_dir_is_zapified(dd)) return (0); err = zap_lookup(os, dd->dd_object, count_prop, sizeof (count), 1, &count); if (err == ENOENT) return (0); if (err != 0) return (err); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); if (err != 0) return (err); /* Is there a limit which we've hit? */ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) return (SET_ERROR(EDQUOT)); if (dd->dd_parent != NULL) err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, ancestor, cr, proc); return (err); } /* * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all * parents. When a new filesystem/snapshot is created, increment the count on * all parents, and when a filesystem/snapshot is destroyed, decrement the * count. */ void dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, dmu_tx_t *tx) { int err; objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t count; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); /* * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$' && strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) { return; } /* * e.g. if renaming a dataset with no snapshots, count adjustment is 0 */ if (delta == 0) return; /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know the counts are not valid on this node and we * know we shouldn't touch this node's counts. An uninitialized count * on the node indicates that either the feature has not yet been * activated or there are no limits on this part of the tree. */ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, prop, sizeof (count), 1, &count)) == ENOENT) return; VERIFY0(err); count += delta; /* Use a signed verify to make sure we're not neg. */ VERIFY3S(count, >=, 0); VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, tx)); /* Roll up this additional count into our ancestors */ if (dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) { ddphys->dd_parent_obj = pds->dd_object; /* update the filesystem counts */ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } uint64_t dsl_dir_get_used(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_bytes); } uint64_t dsl_dir_get_compressed(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_compressed_bytes); } uint64_t dsl_dir_get_quota(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_quota); } uint64_t dsl_dir_get_reservation(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_reserved); } uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd) { /* a fixed point number, 100x the ratio */ return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / dsl_dir_phys(dd)->dd_compressed_bytes)); } uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_uncompressed_bytes); } uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); } uint64_t dsl_dir_get_usedds(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); } uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); } uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } void dsl_dir_get_origin(dsl_dir_t *dd, char *buf) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dsl_dir_get_quota(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dsl_dir_get_reservation(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, dsl_dir_get_logicalused(dd)); if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dsl_dir_get_usedsnap(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dsl_dir_get_usedds(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dsl_dir_get_usedrefreserv(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dsl_dir_get_usedchild(dd)); } mutex_exit(&dd->dd_lock); uint64_t count; if (dsl_dir_get_filesystem_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, count); } if (dsl_dir_get_snapshot_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } if (dsl_dir_is_clone(dd)) { char buf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dir_get_origin(dd, buf); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dsl_dir_phys(dd)); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); uint64_t new_accounted = MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (int i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i & TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); } return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dsl_dir_phys(dd)->dd_quota != 0) quota = dsl_dir_phys(dd)->dd_quota; used = dsl_dir_phys(dd)->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL); quota = MIN(quota, poolsize); } if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg; uint64_t quota; struct tempreserve *tr; int retval; uint64_t ref_rsrv; top_of_function: txg = tx->tx_txg; retval = EDQUOT; ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ uint64_t est_inflight = dsl_dir_space_towrite(dd); for (int i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, !netfree, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error != 0) { mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. */ uint64_t deferred = 0; if (dd->dd_parent == NULL) { uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, (netfree) ? ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); if (avail < quota) { quota = avail; retval = SET_ERROR(ENOSPC); } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (used_on_disk + est_inflight >= quota) { if (est_inflight > 0 || used_on_disk < quota || (retval == ENOSPC && used_on_disk < quota + deferred)) retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (SET_ERROR(retval)); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txg & TXG_MASK] += asize; uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent != NULL && parent_rsrv != 0) { /* * Recurse on our parent without recursion. This has been * observed to be potentially large stack usage even within * the test suite. Largest seen stack was 7632 bytes on linux. */ dd = dd->dd_parent; asize = parent_rsrv; ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); first = B_FALSE; goto top_of_function; } else { return (0); } } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); } else { if (err == EAGAIN) { /* * If arc_memory_throttle() detected that pageout * is running and we are low on memory, we delay new * non-pageout transactions to give pageout an * advantage. * * It is unfortunate to be delaying while the caller's * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } if (err == 0) { err = dsl_dir_tempreserve_impl(dd, asize, netfree, B_FALSE, tr_list, tx, B_TRUE); } if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). */ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while ((tr = list_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } /* * This should be called from open context when we think we're going to write * or free space, for example when dirtying data. Be conservative; it's okay * to write less space or free more, but we don't want to write more or free * less than the amount specified. * * NOTE: The behavior of this function is identical to the Illumos / FreeBSD * version however it has been adjusted to use an iterative rather than * recursive algorithm to minimize stack usage. */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; do { mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); dd = dd->dd_parent; space = parent_space; } while (space && dd); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update * ds->ds_reserved and the dsl_dir accounting, so that * dsl_dataset_check_quota() can see dataset and dir accounting * consistently. */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); dsl_dir_phys(dd)->dd_used_bytes += used; dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; dsl_dir_phys(dd)->dd_compressed_bytes += compressed; if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); dsl_dir_phys(dd)->dd_used_breakdown[type] += used; #ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += dsl_dir_phys(dd)->dd_used_breakdown[t]; ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); } #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); dsl_dir_transfer_space(dd->dd_parent, used - accounted_delta, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); if (delta == 0 || !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dir_set_qr_arg_t; static int dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t towrite, newval; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); error = dsl_prop_predict(ds->ds_dir, "quota", ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. */ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); dsl_dir_phys(ds->ds_dir)->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t newval, used, avail; int error; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { uint64_t delta = MAX(used, newval) - MAX(used, dsl_dir_phys(dd)->dd_reserved); if (delta > avail || (dsl_dir_phys(dd)->dd_quota > 0 && newval > dsl_dir_phys(dd)->dd_quota)) error = SET_ERROR(ENOSPC); } dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_RESERVATION), (longlong_t)newval); } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. */ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; cred_t *ddra_cred; proc_t *ddra_proc; } dsl_dir_rename_arg_t; typedef struct dsl_valid_rename_arg { int char_delta; int nest_delta; } dsl_valid_rename_arg_t; /* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, namebuf); ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); int namelen = strlen(namebuf) + dvra->char_delta; int depth = get_dataset_depth(namebuf) + dvra->nest_delta; if (namelen >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int dsl_dir_rename_check(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; dsl_valid_rename_arg_t dvra; dsl_dataset_t *parentds; objset_t *parentos; const char *mynewname; int error; /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); if (error != 0) return (error); /* new parent should exist */ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname); if (error != 0) { dsl_dir_rele(dd, FTAG); return (error); } /* can't rename to different pool */ if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EXDEV)); } /* new name should not already exist */ if (mynewname == NULL) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EEXIST)); } /* can't rename below anything but filesystems (eg. no ZVOLs) */ error = dsl_dataset_hold_obj(newparent->dd_pool, dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } error = dmu_objset_from_ds(parentds, &parentos); if (error != 0) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } if (dmu_objset_type(parentos) != DMU_OST_ZFS) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } dsl_dataset_rele(parentds, FTAG); ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); dvra.char_delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) - get_dataset_depth(ddra->ddra_oldname); /* if the name length is growing, validate child name lengths */ if (dvra.char_delta > 0 || dvra.nest_delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } if (dmu_tx_is_syncing(tx)) { if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Although this is the check function and we don't * normally make on-disk changes in check functions, * we need to do that here. * * Ensure this portion of the tree's counts have been * initialized in case the new parent has limits set. */ dsl_dir_init_fs_ss_count(dd, tx); } } if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_reserved); objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; if (dsl_dir_is_zapified(dd)) { int err; err = zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } /* * have to add 1 for the filesystem itself that we're * moving */ fs_cnt++; err = zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } } /* check for encryption errors */ error = dsl_dir_rename_crypt_check(dd, newparent); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EACCES)); } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dir_transfer_possible(dd->dd_parent, newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred, ddra->ddra_proc); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (0); } static void dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; /* * We already made sure the dd counts were initialized in the * check function. */ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt)); /* add 1 for the filesystem itself that we're moving */ fs_cnt++; VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt)); } dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(newparent, fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_fs_ss_count_adjust(newparent, ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dsl_dir_phys(dd)->dd_used_bytes, -dsl_dir_phys(dd)->dd_compressed_bytes, -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_compressed_bytes, dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); if (dsl_dir_phys(dd)->dd_reserved > dsl_dir_phys(dd)->dd_used_bytes) { uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx)); (void) strlcpy(dd->dd_myname, mynewname, sizeof (dd->dd_myname)); dsl_dir_rele(dd->dd_parent, dd); dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); + /* TODO: A rename callback to avoid these layering violations. */ + zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname, B_TRUE); dsl_prop_notify_all(dd); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); } int dsl_dir_rename(const char *oldname, const char *newname) { dsl_dir_rename_arg_t ddra; ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; ddra.ddra_cred = CRED(); ddra.ddra_proc = curproc; return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr, proc_t *proc) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (SET_ERROR(ENOSPC)); err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, ancestor, cr, proc); if (err != 0) return (err); err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, ancestor, cr, proc); if (err != 0) return (err); return (0); } inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { inode_timespec_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd) { inode_timespec_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; mutex_exit(&dd->dd_lock); } void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } boolean_t dsl_dir_is_zapified(dsl_dir_t *dd) { dmu_object_info_t doi; dmu_object_info_from_db(dd->dd_dbuf, &doi); return (doi.doi_type == DMU_OTN_ZAP_METADATA); } void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) { objset_t *mos = dd->dd_pool->dp_meta_objset; ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, SPA_FEATURE_LIVELIST)); dsl_deadlist_open(&dd->dd_livelist, mos, obj); bplist_create(&dd->dd_pending_allocs); bplist_create(&dd->dd_pending_frees); } void dsl_dir_livelist_close(dsl_dir_t *dd) { dsl_deadlist_close(&dd->dd_livelist); bplist_destroy(&dd->dd_pending_allocs); bplist_destroy(&dd->dd_pending_frees); } void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) { uint64_t obj; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; livelist_condense_entry_t to_condense = spa->spa_to_condense; if (!dsl_deadlist_is_open(&dd->dd_livelist)) return; /* * If the livelist being removed is set to be condensed, stop the * condense zthr and indicate the cancellation in the spa_to_condense * struct in case the condense no-wait synctask has already started */ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL && (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { /* * We use zthr_wait_cycle_done instead of zthr_cancel * because we don't want to destroy the zthr, just have * it skip its current task. */ spa->spa_to_condense.cancelled = B_TRUE; zthr_wait_cycle_done(ll_condense_thread); /* * If we've returned from zthr_wait_cycle_done without * clearing the to_condense data structure it's either * because the no-wait synctask has started (which is * indicated by 'syncing' field of to_condense) and we * can expect it to clear to_condense on its own. * Otherwise, we returned before the zthr ran. The * checkfunc will now fail as cancelled == B_TRUE so we * can safely NULL out ds, allowing a different dir's * livelist to be condensed. * * We can be sure that the to_condense struct will not * be repopulated at this stage because both this * function and dsl_livelist_try_condense execute in * syncing context. */ if ((spa->spa_to_condense.ds != NULL) && !spa->spa_to_condense.syncing) { dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; } } dsl_dir_livelist_close(dd); VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj)); VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, tx)); if (total) { dsl_deadlist_free(dp->dp_meta_objset, obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); } } static int dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&dd->dd_activity_lock)); switch (activity) { case ZFS_WAIT_DELETEQ: { #ifdef _KERNEL objset_t *os; error = dmu_objset_from_ds(ds, &os); if (error != 0) break; mutex_enter(&os->os_user_ptr_lock); void *user = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (dmu_objset_type(os) != DMU_OST_ZFS || user == NULL || zfs_get_vfs_flag_unmounted(os)) { *in_progress = B_FALSE; return (0); } uint64_t readonly = B_FALSE; error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly, NULL); if (error != 0) break; if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) { *in_progress = B_FALSE; return (0); } uint64_t count, unlinked_obj; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &unlinked_obj); if (error != 0) { dsl_dataset_rele(ds, FTAG); break; } error = zap_count(os, unlinked_obj, &count); if (error == 0) *in_progress = (count != 0); break; #else /* * The delete queue is ZPL specific, and libzpool doesn't have * it. It doesn't make sense to wait for it. */ *in_progress = B_FALSE; break; #endif } default: panic("unrecognized value for activity %d", activity); } return (error); } int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *waited) { int error = 0; boolean_t in_progress; dsl_pool_t *dp = dd->dd_pool; for (;;) { dsl_pool_config_enter(dp, FTAG); error = dsl_dir_activity_in_progress(dd, ds, activity, &in_progress); dsl_pool_config_exit(dp, FTAG); if (error != 0 || !in_progress) break; *waited = B_TRUE; if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) == 0 || dd->dd_activity_cancelled) { error = SET_ERROR(EINTR); break; } } return (error); } void dsl_dir_cancel_waiters(dsl_dir_t *dd) { mutex_enter(&dd->dd_activity_lock); dd->dd_activity_cancelled = B_TRUE; cv_broadcast(&dd->dd_activity_cv); while (dd->dd_activity_waiters > 0) cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock); mutex_exit(&dd->dd_activity_lock); } #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif Index: vendor-sys/openzfs/dist/module/zfs/spa.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/spa.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/spa.c (revision 365340) @@ -1,9755 +1,9756 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "iss", "iss_h", "int", "int_h" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". * * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ unsigned long zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ int zfs_livelist_condense_zthr_pause = 0; int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. */ int zfs_livelist_condense_sync_cancel = 0; int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; dsl_pool_t *dp; int err; err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); if (err) return (err); dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) goto out; /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_dataset_t *ds = NULL; err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); if (err != 0) break; strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: case ZPOOL_PROP_AUTOTRIM: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_MULTIHOST: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); if (!error) { uint32_t hostid = zone_get_hostid(NULL); if (hostid) spa->spa_hostid = hostid; else error = SET_ERROR(ENOTSUP); } break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* Must be ZPL. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; default: break; } if (error) break; } (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid __maybe_unused = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", (u_longlong_t)oldguid, (u_longlong_t)*newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (TREE_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); flags |= TASKQ_DYNAMIC; break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = MIN(zio_taskq_batch_pct, 100); break; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. Under Linux this * means incrementing the priority value on platforms * like illumos it should be decremented. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) pri++; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT3U(tqs->stqs_count, ==, 0); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } taskq_dispatch_ent(tq, func, arg, flags, ent); } /* * Same as spa_taskq_dispatch_ent() but block on the task until completion. */ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; taskqid_t id; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } /* * Disabled until spa_thread() can be adapted for Linux. */ #undef HAVE_SPA_THREAD #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) { psetid_t zio_taskq_psrset_bind = PS_NONE; callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef HAVE_SPA_THREAD /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* HAVE_SPA_THREAD */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); spa_keystore_init(&spa->spa_keystore); /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy * resolution of various deadlocks (zfsonlinux bug #3681). * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. * * A taskq per pool allows one to keep the pools independent. * This way if one pool is suspended, it will not impact another. * * The preferred location to dispatch a zvol minor task is a sync * task. In this context, there is easy access to the spa_t and minimal * error handling is required because the sync task must succeed. */ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); if (spa->spa_zvol_taskq) { taskq_destroy(spa->spa_zvol_taskq); spa->spa_zvol_taskq = NULL; } if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; } if (spa->spa_upgrade_taskq) { taskq_destroy(spa->spa_upgrade_taskq); spa->spa_upgrade_taskq = NULL; } txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa_keystore_fini(&spa->spa_keystore); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } static boolean_t spa_should_flush_logs_on_unload(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return (B_FALSE); if (!spa_writeable(spa)) return (B_FALSE); if (!spa->spa_sync_on) return (B_FALSE); if (spa_state(spa) != POOL_STATE_EXPORTED) return (B_FALSE); if (zfs_keep_log_spacemaps_at_export) return (B_FALSE); return (B_TRUE); } /* * Opens a transaction that will set the flag that will instruct * spa_sync to attempt to flush all the metaslabs for that txg. */ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); } static void spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e != NULL; e = list_head(&spa->spa_log_summary)) { VERIFY0(e->lse_mscount); list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } spa->spa_unflushed_stats.sus_nblocks = 0; spa->spa_unflushed_stats.sus_memused = 0; spa->spa_unflushed_stats.sus_blocklimit = 0; } static void spa_destroy_aux_threads(spa_t *spa) { if (spa->spa_condense_zthr != NULL) { zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } if (spa->spa_livelist_delete_zthr != NULL) { zthr_destroy(spa->spa_livelist_delete_zthr); spa->spa_livelist_delete_zthr = NULL; } if (spa->spa_livelist_condense_zthr != NULL) { zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); spa_load_note(spa, "UNLOADING"); spa_wake_waiters(spa); /* * If the log space map feature is enabled and the pool is getting * exported (but not destroyed), we want to spend some time flushing * as many metaslabs as we can in an attempt to destroy log space * maps and save import time. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_t *root_vdev = spa->spa_root_vdev; vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ if (spa->spa_root_vdev != NULL) { for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; if (vc->vdev_mg != NULL) taskq_wait(vc->vdev_mg->mg_taskq); } } if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } spa_destroy_aux_threads(spa); spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; if (sav->sav_config == NULL) { nl2cache = 0; newvdevs = NULL; goto out; } VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); /* * Upon cache device addition to a pool or pool * creation with a cache device or if the header * of the device is invalid we issue an async * TRIM command for the whole device which will * execute if l2arc_trim_ahead > 0. */ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); if (sav->sav_count > 0) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); vmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). */ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. */ int spa_load_verify_shift = 4; int spa_load_verify_metadata = B_TRUE; int spa_load_verify_data = B_TRUE; /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } /* ARGSUSED */ static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } boolean_t spa_livelist_delete_check(spa_t *spa) { return (spa->spa_livelists_to_delete != 0); } /* ARGSUSED */ static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } static int delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = arg; zio_free(spa, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); return (0); } static int dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) { int err; zap_cursor_t zc; zap_attribute_t za; zap_cursor_init(&zc, os, zap_obj); err = zap_cursor_retrieve(&zc, &za); zap_cursor_fini(&zc); if (err == 0) *llp = za.za_first_integer; return (err); } /* * Components of livelist deletion that must be performed in syncing * context: freeing block pointers and updating the pool-wide data * structures to indicate how much work is left to do */ typedef struct sublist_delete_arg { spa_t *spa; dsl_deadlist_t *ll; uint64_t key; bplist_t *to_free; } sublist_delete_arg_t; static void sublist_delete_sync(void *arg, dmu_tx_t *tx) { sublist_delete_arg_t *sda = arg; spa_t *spa = sda->spa; dsl_deadlist_t *ll = sda->ll; uint64_t key = sda->key; bplist_t *to_free = sda->to_free; bplist_iterate(to_free, delete_blkptr_cb, spa, tx); dsl_deadlist_remove_entry(ll, key, tx); } typedef struct livelist_delete_arg { spa_t *spa; uint64_t ll_obj; uint64_t zap_obj; } livelist_delete_arg_t; static void livelist_delete_sync(void *arg, dmu_tx_t *tx) { livelist_delete_arg_t *lda = arg; spa_t *spa = lda->spa; uint64_t ll_obj = lda->ll_obj; uint64_t zap_obj = lda->zap_obj; objset_t *mos = spa->spa_meta_objset; uint64_t count; /* free the livelist and decrement the feature count */ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); dsl_deadlist_free(mos, ll_obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); VERIFY0(zap_count(mos, zap_obj, &count)); if (count == 0) { /* no more livelists to delete */ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, tx)); VERIFY0(zap_destroy(mos, zap_obj, tx)); spa->spa_livelists_to_delete = 0; spa_notify_waiters(spa); } } /* * Load in the value for the livelist to be removed and open it. Then, * load its first sublist and determine which block pointers should actually * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. */ /* ARGSUSED */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { spa_t *spa = arg; uint64_t ll_obj = 0, count; objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj = spa->spa_livelists_to_delete; /* * Determine the next livelist to delete. This function should only * be called if there is at least one deleted clone. */ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); VERIFY0(zap_count(mos, ll_obj, &count)); if (count > 0) { dsl_deadlist_t ll = { 0 }; dsl_deadlist_entry_t *dle; bplist_t to_free; dsl_deadlist_open(&ll, mos, ll_obj); dle = dsl_deadlist_first(&ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, z, NULL); if (err == 0) { sublist_delete_arg_t sync_arg = { .spa = spa, .ll = &ll, .key = dle->dle_mintxg, .to_free = &to_free }; zfs_dbgmsg("deleting sublist (id %llu) from" " livelist %llu, %d remaining", dle->dle_bpobj.bpo_object, ll_obj, count - 1); VERIFY0(dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } else { VERIFY3U(err, ==, EINTR); } bplist_clear(&to_free); bplist_destroy(&to_free); dsl_deadlist_close(&ll); } else { livelist_delete_arg_t sync_arg = { .spa = spa, .ll_obj = ll_obj, .zap_obj = zap_obj }; zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } } static void spa_start_livelist_destroy_thread(spa_t *spa) { ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy", spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa); } typedef struct livelist_new_arg { bplist_t *allocs; bplist_t *frees; } livelist_new_arg_t; static int livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(tx == NULL); livelist_new_arg_t *lna = arg; if (bp_freed) { bplist_append(lna->frees, bp); } else { bplist_append(lna->allocs, bp); zfs_livelist_condense_new_alloc++; } return (0); } typedef struct livelist_condense_arg { spa_t *spa; bplist_t to_keep; uint64_t first_size; uint64_t next_size; } livelist_condense_arg_t; static void spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) { livelist_condense_arg_t *lca = arg; spa_t *spa = lca->spa; bplist_t new_frees; dsl_dataset_t *ds = spa->spa_to_condense.ds; /* Have we been cancelled? */ if (spa->spa_to_condense.cancelled) { zfs_livelist_condense_sync_cancel++; goto out; } dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; /* * It's possible that the livelist was changed while the zthr was * running. Therefore, we need to check for new blkptrs in the two * entries being condensed and continue to track them in the livelist. * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), * it's possible that the newly added blkptrs are FREEs or ALLOCs so * we need to sort them into two different bplists. */ uint64_t first_obj = first->dle_bpobj.bpo_object; uint64_t next_obj = next->dle_bpobj.bpo_object; uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; bplist_create(&new_frees); livelist_new_arg_t new_bps = { .allocs = &lca->to_keep, .frees = &new_frees, }; if (cur_first_size > lca->first_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, livelist_track_new_cb, &new_bps, lca->first_size)); } if (cur_next_size > lca->next_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, livelist_track_new_cb, &new_bps, lca->next_size)); } dsl_deadlist_clear_entry(first, ll, tx); ASSERT(bpobj_is_empty(&first->dle_bpobj)); dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); bplist_destroy(&new_frees); char dsname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, dsname); zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj, cur_first_size, next_obj, cur_next_size, first->dle_bpobj.bpo_object, first->dle_bpobj.bpo_phys->bpo_num_blkptrs); out: dmu_buf_rele(ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); spa->spa_to_condense.syncing = B_FALSE; } static void spa_livelist_condense_cb(void *arg, zthr_t *t) { while (zfs_livelist_condense_zthr_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); spa_t *spa = arg; dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; uint64_t first_size, next_size; livelist_condense_arg_t *lca = kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); bplist_create(&lca->to_keep); /* * Process the livelists (matching FREEs and ALLOCs) in open context * so we have minimal work in syncing context to condense. * * We save bpobj sizes (first_size and next_size) to use later in * syncing context to determine if entries were added to these sublists * while in open context. This is possible because the clone is still * active and open for normal writes and we want to make sure the new, * unprocessed blockpointers are inserted into the livelist normally. * * Note that dsl_process_sub_livelist() both stores the size number of * blockpointers and iterates over them while the bpobj's lock held, so * the sizes returned to us are consistent which what was actually * processed. */ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, &first_size); if (err == 0) err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, t, &next_size); if (err == 0) { while (zfs_livelist_condense_sync_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_mark_netfree(tx); dmu_tx_hold_space(tx, 1); err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); if (err == 0) { /* * Prevent the condense zthr restarting before * the synctask completes. */ spa->spa_to_condense.syncing = B_TRUE; lca->spa = spa; lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), spa_livelist_condense_sync, lca, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); return; } } /* * Condensing can not continue: either it was externally stopped or * we were unable to assign to a tx because the pool has run out of * space. In the second case, we'll just end up trying to condense * again in a later txg. */ ASSERT(err != 0); bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; if (err == EINTR) zfs_livelist_condense_zthr_cancel++; } /* ARGSUSED */ /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. */ static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && (spa->spa_to_condense.cancelled == B_FALSE)) { return (B_TRUE); } return (B_FALSE); } static void spa_start_livelist_condensing_thread(spa_t *spa) { spa->spa_to_condense.ds = NULL; spa->spa_to_condense.first = NULL; spa->spa_to_condense.next = NULL; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { - zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); + (void) zfs_ereport_post(ereport, spa, + NULL, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); return (error); } #ifdef ZFS_DEBUG /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } #endif /* * Determine whether the activity check is required. */ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. */ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), MMP_INTERVAL(ub), import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%u", import_delay, MMP_INTERVAL(ub), ub->ub_mmp_delay, import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%u leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%u", import_delay, multihost_interval, import_intervals); } return (import_delay); } /* * Perform the import activity check. If the user canceled the import or * we detected activity then fail. */ static int spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; hrtime_t import_expire; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; cv_init(&cv, NULL, CV_DEFAULT, NULL); mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); /* * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed * during the earlier tryimport. If the txg recorded there is 0 then * the pool is known to be active on another host. * * Otherwise, the pool might be in use on another host. Check for * changes in the uberblocks on disk if necessary. */ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { vdev_uberblock_load(rvd, ub, &mmp_label); error = SET_ERROR(EREMOTEIO); goto out; } } import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * spa_get_random(250) / 1000; import_expire = gethrtime() + import_delay; while (gethrtime() < import_expire) { (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { zfs_dbgmsg("multihost activity detected " "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", txg, ub->ub_txg, timestamp, ub->ub_timestamp, mmp_config, ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } if (mmp_label) { nvlist_free(mmp_label); mmp_label = NULL; } error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); if (error != -1) { error = SET_ERROR(EINTR); break; } error = 0; } out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); /* * If the pool is determined to be active store the status in the * spa->spa_load_info nvlist. If the remote hostname or hostid are * available from configuration read from disk store them as well. * This allows 'zpool import' to generate a more useful message. * * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { char *hostname = ""; uint64_t hostid = 0; if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { hostname = fnvlist_lookup_string(mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { hostid = fnvlist_lookup_uint64(mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, 0); error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); } if (mmp_label) nvlist_free(mmp_label); return (error); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " "See: https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; char *comment; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. */ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). * This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (spa->spa_load_max_txg != UINT64_MAX) { (void) spa_import_progress_set_max_txg(spa_guid(spa), (u_longlong_t)spa->spa_load_max_txg); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. */ activity_check = spa_activity_check_required(spa, ub, label, spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. */ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Encryption was added before bookmark_v2, even though bookmark_v2 * is now a dependency. If this pool has encryption enabled without * bookmark_v2, trigger an errata message. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the livelist deletion field. If a livelist is queued for * deletion, indicate that in the spa */ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, &spa->spa_livelists_to_delete, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } error = spa_ld_log_spacemaps(spa); if (error != 0) { spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); spa_import_progress_add(spa); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. */ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ error = spa_ld_load_special_directories(spa); if (error != 0) return (error); /* * Retrieve pool properties from the MOS. */ error = spa_ld_get_props(spa); if (error != 0) return (error); /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check if a rebuild was in progress and if so resume it. * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); } /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open", NULL); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. * * Note: * Since we may be issuing deletes for clones here, * we make sure to do so after we've spawned all the * auxiliary threads above (from which the livelist * deletion zthr is part of). */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); spa_import_progress_remove(spa_guid(spa)); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; spa_import_progress_remove(spa_guid(spa)); return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; firstopen = B_TRUE; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } if (firstopen) zvol_create_minors_recursive(spa_name(spa)); *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); vdev_config_generate_stats(vd, l2cache[i]); } } } static void spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { zap_cursor_t zc; zap_attribute_t za; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } } static void spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) { int i; for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t feature = spa_feature_table[i]; uint64_t refcount; if (feature_get_refcount(spa, &feature, &refcount) != 0) continue; VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); } } /* * Store a list of pool features and their reference counts in the * config. * * The first time this is called on a spa, allocate a new nvlist, fetch * the pool features and reference counts from disk, then save the list * in the spa. In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. */ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) { VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended) == 0); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatenating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Verify encryption parameters for spa creation. If we are encrypting, we must * have the encryption feature flag enabled. */ static int spa_create_check_encryption_params(dsl_crypto_params_t *dcp, boolean_t has_encryption) { if (dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT && !has_encryption) return (SET_ERROR(ENOTSUP)); return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; char *feat_name; char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; feat_name = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; } } /* verify encryption params, if they were provided */ if (dcp != NULL) { error = spa_create_check_encryption_params(dcp, has_encryption); if (error != 0) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (ENOTSUP); } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_ashift_optimize(vd); vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) spa_history_create_obj(spa, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create", tx); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); return (0); } /* * Import a non-root pool into the system. */ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import", NULL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); zvol_create_minors_recursive(pool); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname; dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_is_exporting) { /* the pool is being exported by another thread */ mutex_exit(&spa_namespace_lock); return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); } spa->spa_is_exporting = B_TRUE; /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ if (spa->spa_sync_on) { txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); } /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } if (spa->spa_sync_on) { /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } /* * We're about to export or destroy this pool. Make sure * we stop all initialization and trim activity here before * we set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ if (spa->spa_root_vdev != NULL) { vdev_t *rvd = spa->spa_root_vdev; vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } else { /* * If spa_remove() is not called for this spa_t and * there is any possibility that it can be reused, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz toplevels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ if (tvd->vdev_ops == &vdev_raidz_ops) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. * * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); if (dsl_scan_resilvering(spa_get_dsl(spa))) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (rebuild) { /* * For rebuilds, the parent vdev must support reconstruction * using only space maps. This means the only allowable * parents are the root vdev or a mirror vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(pvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver or rebuild to restart in the future. We do * this to ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ if (rebuild) { newvd->vdev_rebuild_txg = txg; vdev_rebuild(tvd); } else { newvd->vdev_resilver_txg = txg; if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); } } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd __maybe_unused = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. * * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); spa_notify_waiters(spa); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } static int spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_INITIALIZE_START: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); return (0); } int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all initialize threads to stop. */ vdev_initialize_stop_wait(spa, &vd_list); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } else if (!vd->vdev_has_trim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } else if (secure && !vd->vdev_has_securetrim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } mutex_enter(&vd->vdev_trim_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate a TRIM action we check to see if the * vdev_trim_thread is NULL. We do this instead of using the * vdev_trim_state since there might be a previous TRIM process * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_TRIM_SUSPEND && vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_TRIM_START: vdev_trim(vd, rate, partial, secure); break; case POOL_TRIM_CANCEL: vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); break; case POOL_TRIM_SUSPEND: vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_trim_lock); return (0); } /* * Initiates a manual TRIM for the requested vdevs. This kicks off individual * TRIM threads for each child vdev. These threads pass over all of the free * space in the vdev's metaslabs and issues TRIM commands for that space. */ int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping TRIM. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the TRIM operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, rate, partial, secure, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all TRIM threads to stop. */ vdev_trim_stop_wait(spa, &vd_list); /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* deal with indirect vdevs */ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == &vdev_indirect_ops) continue; /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c]) || vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); /* * Temporarily stop the initializing and TRIM activity. We set the * state to ACTIVE so that we know to resume initializing or TRIM * once the split has completed. */ list_t vd_initialize_list; list_create(&vd_initialize_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); list_t vd_trim_list; list_create(&vd_trim_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); mutex_exit(&vml[c]->vdev_initialize_lock); mutex_enter(&vml[c]->vdev_trim_lock); vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); mutex_exit(&vml[c]->vdev_trim_lock); } } vdev_initialize_stop_wait(spa, &vd_initialize_list); vdev_trim_stop_wait(spa, &vd_trim_list); list_destroy(&vd_initialize_list); list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* * Need to be sure the detachable VDEV is not * on any *other* txg's DTL list to prevent it * from being accessed after it's freed. */ for (int t = 0; t < TXG_SIZE; t++) { (void) txg_list_remove_this( &tvd->vdev_dtl_list, vml[c], t); } vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing or trimming disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); /* * If a detach was not performed above replace waiters will not have * been notified. In which case we must do so now. */ spa_notify_waiters(spa); } /* * Update the stored path or FRU for this vdev. */ static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); if (func == POOL_SCAN_RESILVER && !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } static void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), (u_longlong_t)new_space, (u_longlong_t)(new_space - old_space)); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * If any devices are done replacing, detach them. Then if no * top-level vdevs are rebuilding attempt to kick off a scrub. */ if (tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); if (!vdev_rebuild_active(spa->spa_root_vdev)) (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_TRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache rebuilding. */ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_cancel(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_cancel(ll_condense_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_resume(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_resume(ll_condense_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } int spa_async_tasks(spa_t *spa) { return (spa->spa_async_tasks); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); } int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *pio = arg; zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, pio->io_flags)); return (0); } static int bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (spa_free_sync_cb(arg, bp, tx)); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; /* * Note: * If the log space map feature is active, we stop deferring * frees to the next TXG and therefore running this function * would be considered a no-op as spa_deferred_bpobj should * not have any entries. * * That said we run this function anyway (instead of returning * immediately) for the edge-case scenario where we just * activated the log space map feature in this TXG but we have * deferred frees from the previous TXG. */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); vmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. */ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", (longlong_t)version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuration has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), (longlong_t)intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOTRIM: spa->spa_autotrim = intval; spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } uint64_t obsolete_sm_object = 0; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Set the top-level vdev's max queue depth. Evaluate each top-level's * async write queue depth in case it changed. The max queue depth will * not change in the middle of syncing out this txg. */ static void spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; metaslab_class_t *normal = spa_normal_class(spa); metaslab_class_t *special = spa_special_class(spa); metaslab_class_t *dedup = spa_dedup_class(spa); uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || !metaslab_group_initialized(mg)) continue; metaslab_class_t *mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < mg->mg_allocators; i++) { ASSERT0(zfs_refcount_count( &(mg->mg_allocator[i].mga_alloc_queue_depth))); } mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < mg->mg_allocators; i++) { mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); normal->mc_alloc_max_slots[i] = slots_per_allocator; special->mc_alloc_max_slots[i] = slots_per_allocator; dedup->mc_alloc_max_slots[i] = slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; } static void spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } } static void spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) { objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; uint64_t txg = tx->tx_txg; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free || spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { /* * If the log space map feature is active we don't * care about deferred frees and the deferred bpobj * as the log space map should effectively have the * same results (i.e. appending only to one object). */ spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); spa_flush_metaslabs(spa, tx); vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock * (e.g. if we have sync tasks but no dirty user data). We need * to check the uberblock's rootbp because it is updated if we * have synced out dirty data (though in this case the MOS will * most likely also be dirty due to second order effects, we * don't want to rely on that here). */ if (pass == 1 && spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this * TXG is a no-op. Avoid syncing deferred frees, so * that we can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); } /* * Rewrite the vdev configuration (which includes the uberblock) to * commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few random * top-level vdevs that are known to be visible in the config cache * (see spa_vdev_add() for a complete description). If there *are* dirty * vdevs, sync the uberblock to all vdevs. */ static void spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; for (;;) { int error = 0; /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { vdev_t *vd = NULL; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); dsl_pool_t *dp = spa->spa_dsl_pool; dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { vdev_t *rvd = spa->spa_root_vdev; int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } spa_sync_adjust_vdev_max_queue_depth(spa); spa_sync_condense_indirect(spa, tx); spa_sync_iterate_to_convergence(spa, tx); #ifdef ZFS_DEBUG if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } #endif if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } uint64_t spa_total_metaslabs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t m = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; m += vd->vdev_ms_count; } return (m); } /* * Notify any waiting threads that some activity has switched from being in- * progress to not-in-progress so that the thread can wake up and determine * whether it is finished waiting. */ void spa_notify_waiters(spa_t *spa) { /* * Acquiring spa_activities_lock here prevents the cv_broadcast from * happening between the waiting thread's check and cv_wait. */ mutex_enter(&spa->spa_activities_lock); cv_broadcast(&spa->spa_activities_cv); mutex_exit(&spa->spa_activities_lock); } /* * Notify any waiting threads that the pool is exporting, and then block until * they are finished using the spa_t. */ void spa_wake_waiters(spa_t *spa) { mutex_enter(&spa->spa_activities_lock); spa->spa_waiters_cancel = B_TRUE; cv_broadcast(&spa->spa_activities_cv); while (spa->spa_waiters != 0) cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); spa->spa_waiters_cancel = B_FALSE; mutex_exit(&spa->spa_activities_lock); } /* Whether the vdev or any of its descendants are being initialized/trimmed. */ static boolean_t spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); ASSERT(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? &vd->vdev_initialize_lock : &vd->vdev_trim_lock; mutex_exit(&spa->spa_activities_lock); mutex_enter(lock); mutex_enter(&spa->spa_activities_lock); boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); mutex_exit(lock); if (in_progress) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], activity)) return (B_TRUE); } return (B_FALSE); } /* * If use_guid is true, this checks whether the vdev specified by guid is * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool * is being initialized/trimmed. The caller must hold the config lock and * spa_activities_lock. */ static int spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, zpool_wait_activity_t activity, boolean_t *in_progress) { mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); vdev_t *vd; if (use_guid) { vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (EINVAL); } } else { vd = spa->spa_root_vdev; } *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (0); } /* * Locking for waiting threads * --------------------------- * * Waiting threads need a way to check whether a given activity is in progress, * and then, if it is, wait for it to complete. Each activity will have some * in-memory representation of the relevant on-disk state which can be used to * determine whether or not the activity is in progress. The in-memory state and * the locking used to protect it will be different for each activity, and may * not be suitable for use with a cvar (e.g., some state is protected by the * config lock). To allow waiting threads to wait without any races, another * lock, spa_activities_lock, is used. * * When the state is checked, both the activity-specific lock (if there is one) * and spa_activities_lock are held. In some cases, the activity-specific lock * is acquired explicitly (e.g. the config lock). In others, the locking is * internal to some check (e.g. bpobj_is_empty). After checking, the waiting * thread releases the activity-specific lock and, if the activity is in * progress, then cv_waits using spa_activities_lock. * * The waiting thread is woken when another thread, one completing some * activity, updates the state of the activity and then calls * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only * needs to hold its activity-specific lock when updating the state, and this * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. * * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, * and because it is held when the waiting thread checks the state of the * activity, it can never be the case that the completing thread both updates * the activity state and cv_broadcasts in between the waiting thread's check * and cv_wait. Thus, a waiting thread can never miss a wakeup. * * In order to prevent deadlock, when the waiting thread does its check, in some * cases it will temporarily drop spa_activities_lock in order to acquire the * activity-specific lock. The order in which spa_activities_lock and the * activity specific lock are acquired in the waiting thread is determined by * the order in which they are acquired in the completing thread; if the * completing thread calls spa_notify_waiters with the activity-specific lock * held, then the waiting thread must also acquire the activity-specific lock * first. */ static int spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); switch (activity) { case ZPOOL_WAIT_CKPT_DISCARD: *in_progress = (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == ENOENT); break; case ZPOOL_WAIT_FREE: *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || spa_livelist_delete_check(spa)); break; case ZPOOL_WAIT_INITIALIZE: case ZPOOL_WAIT_TRIM: error = spa_vdev_activity_in_progress(spa, use_tag, tag, activity, in_progress); break; case ZPOOL_WAIT_REPLACE: mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); break; case ZPOOL_WAIT_REMOVE: *in_progress = (spa->spa_removing_phys.sr_state == DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) break; /* fall through */ case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); scanning = (scn->scn_phys.scn_state == DSS_SCANNING); paused = dsl_scan_is_paused_scrub(scn); *in_progress = (scanning && !paused && is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } default: panic("unrecognized value for activity %d", activity); } return (error); } static int spa_wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { /* * The tag is used to distinguish between instances of an activity. * 'initialize' and 'trim' are the only activities that we use this for. * The other activities can only have a single instance in progress in a * pool at one time, making the tag unnecessary. * * There can be multiple devices being replaced at once, but since they * all finish once resilvering finishes, we don't bother keeping track * of them individually, we just wait for them all to finish. */ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && activity != ZPOOL_WAIT_TRIM) return (EINVAL); if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) return (EINVAL); spa_t *spa; int error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); /* * Increment the spa's waiter count so that we can call spa_close and * still ensure that the spa_t doesn't get freed before this thread is * finished with it when the pool is exported. We want to call spa_close * before we start waiting because otherwise the additional ref would * prevent the pool from being exported or destroyed throughout the * potentially long wait. */ mutex_enter(&spa->spa_activities_lock); spa->spa_waiters++; spa_close(spa, FTAG); *waited = B_FALSE; for (;;) { boolean_t in_progress; error = spa_activity_in_progress(spa, activity, use_tag, tag, &in_progress); if (error || !in_progress || spa->spa_waiters_cancel) break; *waited = B_TRUE; if (cv_wait_sig(&spa->spa_activities_cv, &spa->spa_activities_lock) == 0) { error = EINTR; break; } } spa->spa_waiters--; cv_signal(&spa->spa_waiters_cv); mutex_exit(&spa->spa_activities_lock); return (error); } /* * Wait for a particular instance of the specified activity to complete, where * the instance is identified by 'tag' */ int spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); } /* * Wait for all instances of the specified activity complete */ int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); if (resource) { ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL if (ev) { zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } #endif } /* * Post a zevent corresponding to the given sysevent. The 'name' must be one * of the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); EXPORT_SYMBOL(spa_get_stats); EXPORT_SYMBOL(spa_create); EXPORT_SYMBOL(spa_import); EXPORT_SYMBOL(spa_tryimport); EXPORT_SYMBOL(spa_destroy); EXPORT_SYMBOL(spa_export); EXPORT_SYMBOL(spa_reset); EXPORT_SYMBOL(spa_async_request); EXPORT_SYMBOL(spa_async_suspend); EXPORT_SYMBOL(spa_async_resume); EXPORT_SYMBOL(spa_inject_addref); EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); /* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); /* spare statech is global across all pools) */ EXPORT_SYMBOL(spa_spare_add); EXPORT_SYMBOL(spa_spare_remove); EXPORT_SYMBOL(spa_spare_exists); EXPORT_SYMBOL(spa_spare_activate); /* L2ARC statech is global across all pools) */ EXPORT_SYMBOL(spa_l2cache_add); EXPORT_SYMBOL(spa_l2cache_remove); EXPORT_SYMBOL(spa_l2cache_exists); EXPORT_SYMBOL(spa_l2cache_activate); EXPORT_SYMBOL(spa_l2cache_drop); /* scanning */ EXPORT_SYMBOL(spa_scan); EXPORT_SYMBOL(spa_scan_stop); /* spa syncing */ EXPORT_SYMBOL(spa_sync); /* only for DMU use */ EXPORT_SYMBOL(spa_sync_allpools); /* properties */ EXPORT_SYMBOL(spa_prop_set); EXPORT_SYMBOL(spa_prop_get); EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, "log2(fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, "Set to traverse data on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, "Percentage of CPUs to run an IO worker thread"); ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/spa_config.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/spa_config.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/spa_config.c (revision 365340) @@ -1,620 +1,621 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Pool configuration repository. * * Pool configuration is stored as a packed nvlist on the filesystem. By * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot * (when the ZFS module is loaded). Pools can also have the 'cachefile' * property set that allows them to be stored in an alternate location until * the control of external software. * * For each cache file, we have a single nvlist which holds all the * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or * the configuration of a pool is changed, we call spa_write_cachefile(), which * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; /* * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ char *spa_config_path = ZPOOL_CACHE; int zfs_autoimport_disable = 1; /* * Called when the module is first loaded, this routine loads the configuration * file into the SPA namespace. It does not actually open or load the pools; it * only populates the namespace. */ void spa_config_load(void) { void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; char *pathname; zfs_file_t *fp; zfs_file_attr_t zfa; uint64_t fsize; int err; #ifdef _KERNEL if (zfs_autoimport_disable) return; #endif /* * Open the configuration file. */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); err = zfs_file_open(pathname, O_RDONLY, 0, &fp); #ifdef __FreeBSD__ if (err) err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); #endif kmem_free(pathname, MAXPATHLEN); if (err) return; if (zfs_file_getattr(fp, &zfa)) goto out; fsize = zfa.zfa_size; buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. */ if (zfs_file_read(fp, buf, fsize, NULL) < 0) goto out; /* * Unpack the nvlist. */ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) goto out; /* * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. */ mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; child = fnvpair_value_nvlist(nvpair); if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); nvlist_free(nvlist); out: if (buf != NULL) kmem_free(buf, fsize); zfs_file_close(fp); } static int spa_config_remove(spa_config_dirent_t *dp) { int error = 0; /* * Remove the cache file. If zfs_file_unlink() in not supported by the * platform fallback to truncating the file which is functionally * equivalent. */ error = zfs_file_unlink(dp->scd_path); if (error == EOPNOTSUPP) { int flags = O_RDWR | O_TRUNC; zfs_file_t *fp; error = zfs_file_open(dp->scd_path, flags, 0644, &fp); if (error == 0) { (void) zfs_file_fsync(fp, O_SYNC); (void) zfs_file_close(fp); } } return (error); } static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE; char *temp; int err; zfs_file_t *fp; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { err = spa_config_remove(dp); if (err == ENOENT) err = 0; return (err); } /* * Pack the configuration into a buffer. */ buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); /* * Write the configuration to disk. Due to the complexity involved * in performing a rename and remove from within the kernel the file * is instead truncated and overwritten in place. This way we always * have a consistent view of the data or a zero length file. */ err = zfs_file_open(dp->scd_path, oflags, 0644, &fp); if (err == 0) { err = zfs_file_write(fp, buf, buflen, NULL); if (err == 0) err = zfs_file_fsync(fp, O_SYNC); zfs_file_close(fp); if (err) (void) spa_config_remove(dp); } fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); return (err); } /* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicit import * would be required. */ void spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; char *pool_name; boolean_t ccw_failure; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (!(spa_mode_global & SPA_MODE_WRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. */ if ((spa == target && removing) || !spa_writeable(spa)) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) nvl = fnvlist_alloc(); if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) pool_name = fnvlist_lookup_string( spa->spa_config, ZPOOL_CONFIG_POOL_NAME); else pool_name = spa_name(spa); fnvlist_add_nvlist(nvl, pool_name, spa->spa_config); mutex_exit(&spa->spa_props_lock); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { - zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, + (void) zfs_ereport_post( + FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, NULL, 0, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); } /* * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, * and we don't want to allow the local zone to see all the pools anyway. * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ nvlist_t * spa_all_configs(uint64_t *generation) { nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) return (NULL); pools = fnvlist_alloc(); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } } *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); return (pools); } void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL && spa->spa_config != config) nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } /* * Generate the pool's configuration based on the current in-core state. * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ nvlist_t * spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; char *pool_name; if (vd == NULL) { vd = rvd; locked = B_TRUE; spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. */ if (txg == -1ULL) txg = spa->spa_config_txg; /* * Originally, users had to handle spa namespace collisions by either * exporting the already imported pool or by specifying a new name for * the pool with a conflicting name. In the case of root pools from * virtual guests, neither approach to collision resolution is * reasonable. This is addressed by extending the new name syntax with * an option to specify that the new name is temporary. When specified, * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us * to use the previous name, which we do below. */ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { VERIFY0(nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &pool_name)); } else pool_name = spa_name(spa); config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); hostid = spa_get_hostid(spa); if (hostid != 0) fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); int config_gen_flags = 0; if (vd != rvd) { fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_isspare) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL); if (vd->vdev_islog) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL); vd = vd->vdev_top; /* label contains top config */ } else { /* * Only add the (potentially large) split information * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, spa->spa_config_splitting); fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); /* * If we're splitting, record the original pool's guid. */ if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); } nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); /* * Store what's necessary for reading the MOS in the label. */ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, spa->spa_label_features); if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); } else { /* * If we have top-level vdevs that were added but have * not yet been prepared for allocation, do that now. * (It's safe now because the config cache is up to date, * so it will be able to translate the new DVAs.) * See comments in spa_vdev_add() for full details. */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * Explicitly skip vdevs that are indirect or * log vdevs that are being removed. The reason * is that both of those can have vdev_ms_array * set to 0 and we wouldn't want to change their * metaslab size nor call vdev_expand() on them. */ if (!vdev_is_concrete(tvd) || (tvd->vdev_islog && tvd->vdev_removing)) continue; if (tvd->vdev_ms_array == 0) { vdev_ashift_optimize(tvd); vdev_metaslab_set_size(tvd); } vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. */ txg_wait_synced(spa->spa_dsl_pool, txg); /* * Update the global config cache to reflect the new mosconfig. */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); } if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); /* BEGIN CSTYLED */ #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, "SPA config file (/etc/zfs/zpool.cache)"); #endif ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, "Disable pool import at module load"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/vdev.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/vdev.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/vdev.c (revision 365340) @@ -1,5090 +1,5091 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* default target for number of metaslabs per top-level vdev */ int zfs_vdev_default_ms_count = 200; /* minimum number of metaslabs per top-level vdev */ int zfs_vdev_min_ms_count = 16; /* practical upper limit of total metaslabs per top-level vdev */ int zfs_vdev_ms_count_limit = 1ULL << 17; /* lower limit for metaslab size (512M) */ int zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ int zfs_vdev_max_ms_shift = 34; int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ int zfs_vdev_dtl_sm_blksz = (1 << 12); /* * Rate limit slow IO (delay) events to this many per second. */ unsigned int zfs_slow_io_events_per_second = 20; /* * Rate limit checksum events after this many checksum errors per second. */ unsigned int zfs_checksum_events_per_second = 20; /* * Ignore errors during scrub/resilver. Allows to work around resilver * upon import when there are pool errors. */ int zfs_scan_ignore_errors = 0; /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ int zfs_vdev_standard_sm_blksz = (1 << 17); /* * Tunable parameter for debugging or performance analysis. Setting this * will cause pool corruption on power loss if a volatile out-of-order * write cache is enabled. */ int zfs_nocacheflush = 0; uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; /*PRINTFLIKE2*/ void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); if (vd->vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) { res->rs_start = in->rs_start; res->rs_end = in->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(1M). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize. */ if (pvd->vdev_ops == &vdev_raidz_ops) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); /* * If creating a top-level vdev, check for allocation classes input */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } } vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vd->vdev_enc_sysfs_path) == 0) vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); mutex_destroy(&vd->vdev_rebuild_io_lock); cv_destroy(&vd->vdev_rebuild_cv); cv_destroy(&vd->vdev_rebuild_io_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } static void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); /* * The spa ashift values currently only reflect the * general vdev classes. Class destination is late * binding so ashift checking had to wait until now */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } #ifndef _KERNEL /* * To accommodate zdb_leak_init() fake indirect * metaslabs, we allocate a metaslab group for * indirect vdevs which normally don't have one. */ if (vd->vdev_mg == NULL) { ASSERT0(vdev_is_concrete(vd)); vdev_metaslab_group_create(vd); } #endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); /* * Regardless whether this vdev was just added or it is being * expanded, the metaslab count has changed. Recalculate the * block limit. */ spa_log_sm_set_blocklimit(spa); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); - zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, + (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (vdev_uses_zvols(vd)) { retry_sync: for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); } else { tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); if (tq == NULL) goto retry_sync; for (int c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); taskq_destroy(tq); } vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { - zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, + (void) zfs_ereport_post( + FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the min and max ashift values for normal data devices. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_alloc_bias == VDEV_BIAS_NONE && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; if (vdev_validate_skip) return (0); for (uint64_t c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ void vdev_ashift_optimize(vdev_t *vd) { if (vd == vd->vdev_top) { if (vd->vdev_ashift < vd->vdev_physical_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * Unusual case where logical ashift > physical ashift * so we can't cap the calculated ashift based on max * ashift as that would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * Ignore the DTLs and try all devices. This avoids a recursive * mutex enter on the vdev_dtl_lock, and also makes us try hard * when loading the pool (relying on the checksum to ensure that * we get the right data -- note that we while loading, we are * only reading the MOS, which is always checksummed). */ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Returns B_TRUE if vdev determines offset needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int error = 0; /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { error = vdev_load(vd->vdev_child[c]); if (error != 0) { return (error); } } vdev_set_deflate_ratio(vd); /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", vd->vdev_top_zap, error); return (error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect vdev. */ if (!vdev_is_concrete(vd)) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_class[t].vqc_active; vsx->vsx_pend_queue[t] = avl_numnodes( &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(rebuilt, psize); vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_IOCTL. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to catagorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM. */ if (priority == ZIO_PRIORITY_REBUILD) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_SCRUB); } else if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (spa->spa_load_state == SPA_LOAD_NONE && type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } - zfs_ereport_post(class, spa, vd, NULL, NULL, + (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { return (B_FALSE); } } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %d active IOs", vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } /* * Translate a logical range to the physical range for the specified vdev_t. * This function is initially called with a leaf vdev and will walk each * parent vdev until it reaches a top-level vdev. Once the top-level is * reached the physical range is initialized and the recursive function * begins to unwind. As it unwinds it calls the parent's vdev specific * translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); } else { /* * We've reached the top-level vdev, initialize the * physical range to the logical range and start to * unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical components by calling the * vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW, "Default limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below zed threshold)."); ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/vdev_indirect.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/vdev_indirect.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/vdev_indirect.c (revision 365340) @@ -1,1890 +1,1890 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2014, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * An indirect vdev corresponds to a vdev that has been removed. Since * we cannot rewrite block pointers of snapshots, etc., we keep a * mapping from old location on the removed device to the new location * on another device in the pool and use this mapping whenever we need * to access the DVA. Unfortunately, this mapping did not respect * logical block boundaries when it was first created, and so a DVA on * this indirect vdev may be "split" into multiple sections that each * map to a different location. As a consequence, not all DVAs can be * translated to an equivalent new DVA. Instead we must provide a * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * * - i/os to this vdev use the callback to determine where the * data is now located, and issue child i/os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak * detection.) */ /* * "Big theory statement" for how we mark blocks obsolete. * * When a block on an indirect vdev is freed or remapped, a section of * that vdev's mapping may no longer be referenced (aka "obsolete"). We * keep track of how much of each mapping entry is obsolete. When * an entry becomes completely obsolete, we can remove it, thus reducing * the memory used by the mapping. The complete picture of obsolescence * is given by the following data structures, described below: * - the entry-specific obsolete count * - the vdev-specific obsolete spacemap * - the pool-specific obsolete bpobj * * == On disk data structures used == * * We track the obsolete space for the pool using several objects. Each * of these objects is created on demand and freed when no longer * needed, and is assumed to be empty if it does not exist. * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. * * - Each vic_mapping_object (associated with an indirect vdev) can * have a vimp_counts_object. This is an array of uint32_t's * with the same number of entries as the vic_mapping_object. When * the mapping is condensed, entries from the vic_obsolete_sm_object * (see below) are folded into the counts. Therefore, each * obsolete_counts entry tells us the number of bytes in the * corresponding mapping entry that were not referenced when the * mapping was last condensed. * * - Each indirect or removing vdev can have a vic_obsolete_sm_object. * This is a space map containing an alloc entry for every DVA that * has been obsoleted since the last time this indirect vdev was * condensed. We use this object in order to improve performance * when marking a DVA as obsolete. Instead of modifying an arbitrary * offset of the vimp_counts_object, we only need to append an entry * to the end of this object. When a DVA becomes obsolete, it is * added to the obsolete space map. This happens when the DVA is * freed, remapped and not referenced by a snapshot, or the last * snapshot referencing it is destroyed. * * - Each dataset can have a ds_remap_deadlist object. This is a * deadlist object containing all blocks that were remapped in this * dataset but referenced in a previous snapshot. Blocks can *only* * appear on this list if they were remapped (dsl_dataset_block_remapped); * blocks that were killed in a head dataset are put on the normal * ds_deadlist and marked obsolete when they are freed. * * - The pool can have a dp_obsolete_bpobj. This is a list of blocks * in the pool that need to be marked obsolete. When a snapshot is * destroyed, we move some of the ds_remap_deadlist to the obsolete * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then * asynchronously process the obsolete bpobj, moving its entries to * the specific vdevs' obsolete space maps. * * == Summary of how we mark blocks as obsolete == * * - When freeing a block: if any DVA is on an indirect vdev, append to * vic_obsolete_sm_object. * - When remapping a block, add dva to ds_remap_deadlist (if prev snap * references; otherwise append to vic_obsolete_sm_object). * - When freeing a snapshot: move parts of ds_remap_deadlist to * dp_obsolete_bpobj (same algorithm as ds_deadlist). * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to * individual vdev's vic_obsolete_sm_object. */ /* * "Big theory statement" for how we condense indirect vdevs. * * Condensing an indirect vdev's mapping is the process of determining * the precise counts of obsolete space for each mapping entry (by * integrating the obsolete spacemap into the obsolete counts) and * writing out a new mapping that contains only referenced entries. * * We condense a vdev when we expect the mapping to shrink (see * vdev_indirect_should_condense()), but only perform one condense at a * time to limit the memory usage. In addition, we use a separate * open-context thread (spa_condense_indirect_thread) to incrementally * create the new mapping object in a way that minimizes the impact on * the rest of the system. * * == Generating a new mapping == * * To generate a new mapping, we follow these steps: * * 1. Save the old obsolete space map and create a new mapping object * (see spa_condense_indirect_start_sync()). This initializes the * spa_condensing_indirect_phys with the "previous obsolete space map", * which is now read only. Newly obsolete DVAs will be added to a * new (initially empty) obsolete space map, and will not be * considered as part of this condense operation. * * 2. Construct in memory the precise counts of obsolete space for each * mapping entry, by incorporating the obsolete space map into the * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) * * 3. Iterate through each mapping entry, writing to the new mapping any * entries that are not completely obsolete (i.e. which don't have * obsolete count == mapping length). (See * spa_condense_indirect_generate_new_mapping().) * * 4. Destroy the old mapping object and switch over to the new one * (spa_condense_indirect_complete_sync). * * == Restarting from failure == * * To restart the condense when we import/open the pool, we must start * at the 2nd step above: reconstruct the precise counts in memory, * based on the space map + counts. Then in the 3rd step, we start * iterating where we left off: at vimp_max_offset of the new mapping * object. */ int zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is * obsolete. With the default of 25%, the amount of space mapped * will be reduced to 1% of its original size after at most 16 * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. */ int zfs_indirect_condense_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of * space on disk (logically). This limits the amount of disk space * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a condense (which might otherwise * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ int zfs_condense_indirect_commit_entry_delay_ms = 0; /* * If an indirect split block contains more than this many possible unique * combinations when being reconstructed, consider it too computationally * expensive to check them all. Instead, try at most 100 randomly-selected * combinations each time the block is accessed. This allows all segment * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ int zfs_reconstruct_indirect_combinations_max = 4096; /* * Enable to simulate damaged segments and validate reconstruction. This * is intentionally not exposed as a module parameter. */ unsigned long zfs_reconstruct_indirect_damage_fraction = 0; /* * The indirect_child_t represents the vdev that we will read from, when we * need to read all copies of the data (e.g. for scrub or reconstruction). * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, * ic_vdev is a child of the mirror. */ typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; /* * ic_duplicate is NULL when the ic_data contents are unique, when it * is determined to be a duplicate it references the primary child. */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ } indirect_child_t; /* * The indirect_split_t represents one mapped segment of an i/o to the * indirect vdev. For non-split (contiguously-mapped) blocks, there will be * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. * For split blocks, there will be several of these. */ typedef struct indirect_split { list_node_t is_node; /* link on iv_splits */ /* * is_split_offset is the offset into the i/o. * This is the sum of the previous splits' is_size's. */ uint64_t is_split_offset; vdev_t *is_vdev; /* top-level vdev */ uint64_t is_target_offset; /* offset on is_vdev */ uint64_t is_size; int is_children; /* number of entries in is_child[] */ int is_unique_children; /* number of entries in is_unique_child */ list_t is_unique_child; /* * is_good_child is the child that we are currently using to * attempt reconstruction. */ indirect_child_t *is_good_child; indirect_child_t is_child[1]; /* variable-length */ } indirect_split_t; /* * The indirect_vsd_t is associated with each i/o to the indirect vdev. * It is the "Vdev-Specific Data" in the zio_t's io_vsd. */ typedef struct indirect_vsd { boolean_t iv_split_block; boolean_t iv_reconstruct; uint64_t iv_unique_combinations; uint64_t iv_attempts; uint64_t iv_attempts_max; list_t iv_splits; /* list of indirect_split_t's */ } indirect_vsd_t; static void vdev_indirect_map_free(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; while ((is = list_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } list_remove(&iv->iv_splits, is); indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); list_destroy(&is->is_unique_child); kmem_free(is, offsetof(indirect_split_t, is_child[is->is_children])); } kmem_free(iv, sizeof (*iv)); } static const zio_vsd_ops_t vdev_indirect_vsd_ops = { .vsd_free = vdev_indirect_map_free, .vsd_cksum_report = zio_vsd_default_cksum_report }; /* * Mark the given offset and size as being obsolete. */ void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); VERIFY(vdev_indirect_mapping_entry_for_offset( vd->vdev_indirect_mapping, offset) != NULL); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { mutex_enter(&vd->vdev_obsolete_lock); range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } /* * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This * wrapper is provided because the DMU does not know about vdev_t's and * cannot directly call vdev_indirect_mark_obsolete. */ void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, uint64_t size, dmu_tx_t *tx) { vdev_t *vd = vdev_lookup_top(spa, vdev_id); ASSERT(dmu_tx_is_syncing(tx)); /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * spa_condensing_indirect_create(spa_t *spa) { spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); objset_t *mos = spa->spa_meta_objset; for (int i = 0; i < TXG_SIZE; i++) { list_create(&sci->sci_new_mapping_entries[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } sci->sci_new_mapping = vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); return (sci); } static void spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) { for (int i = 0; i < TXG_SIZE; i++) list_destroy(&sci->sci_new_mapping_entries[i]); if (sci->sci_new_mapping != NULL) vdev_indirect_mapping_close(sci->sci_new_mapping); kmem_free(sci, sizeof (*sci)); } boolean_t vdev_indirect_should_condense(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); if (!zfs_condense_indirect_vdevs_enable) return (B_FALSE); /* * We can only condense one indirect vdev at a time. */ if (spa->spa_condensing_indirect != NULL) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); /* * The mapping object size must not change while we are * condensing, so we can only condense indirect vdevs * (not vdevs that are still in the middle of being removed). */ if (vd->vdev_ops != &vdev_indirect_ops) return (B_FALSE); /* * If nothing new has been marked obsolete, there is no * point in condensing. */ uint64_t obsolete_sm_obj __maybe_unused; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); if (vd->vdev_obsolete_sm == NULL) { ASSERT0(obsolete_sm_obj); return (B_FALSE); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm)); uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); uint64_t mapping_size = vdev_indirect_mapping_size(vim); uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); ASSERT3U(bytes_obsolete, <=, bytes_mapped); /* * If a high percentage of the bytes that are mapped have become * obsolete, condense (unless the mapping is already small enough). * This has a good chance of reducing the amount of memory used * by the mapping. */ if (bytes_obsolete * 100 / bytes_mapped >= zfs_indirect_condense_obsolete_pct && mapping_size > zfs_condense_min_mapping_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete " "spacemap covers %d%% of %lluMB mapping", (u_longlong_t)vd->vdev_id, (int)(bytes_obsolete * 100 / bytes_mapped), (u_longlong_t)bytes_mapped / 1024 / 1024); return (B_TRUE); } /* * If the obsolete space map takes up too much space on disk, * condense in order to free up this disk space. */ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete sm " "length %lluMB >= max size %lluMB", (u_longlong_t)vd->vdev_id, (u_longlong_t)obsolete_sm_size / 1024 / 1024, (u_longlong_t)zfs_condense_max_obsolete_bytes / 1024 / 1024); return (B_TRUE); } return (B_FALSE); } /* * This sync task completes (finishes) a condense, deleting the old * mapping and replacing it with the new one. */ static void spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_meta_objset; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); uint64_t new_count = vdev_indirect_mapping_num_entries(sci->sci_new_mapping); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3P(sci, ==, spa->spa_condensing_indirect); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } ASSERT(vic->vic_mapping_object != 0); ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); /* * Reset vdev_indirect_mapping to refer to the new object. */ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = sci->sci_new_mapping; rw_exit(&vd->vdev_indirect_rwlock); sci->sci_new_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = scip->scip_next_mapping_object; scip->scip_next_mapping_object = 0; space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); scip->scip_prev_obsolete_sm_object = 0; scip->scip_vdev = 0; VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, tx)); spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, new_count, old_count); vdev_config_dirty(spa->spa_root_vdev); } /* * This sync task appends entries to the new mapping object. */ static void spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(sci, ==, spa->spa_condensing_indirect); vdev_indirect_mapping_add_entries(sci->sci_new_mapping, &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); } /* * Open-context function to add one entry to the new mapping. The new * entry will be remembered and written from syncing context. */ static void spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync * task to write to the MOS on our behalf. */ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), spa_condense_indirect_commit_sync, sci, 0, ZFS_SPACE_CHECK_NONE, tx); } vdev_indirect_mapping_entry_t *vime = kmem_alloc(sizeof (*vime), KM_SLEEP); vime->vime_mapping = *vimep; vime->vime_obsolete_count = count; list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); dmu_tx_commit(tx); } static void spa_condense_indirect_generate_new_mapping(vdev_t *vd, uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) { spa_t *spa = vd->vdev_spa; uint64_t mapi = start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_num_entries = vdev_indirect_mapping_num_entries(old_mapping); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); zfs_dbgmsg("starting condense of vdev %llu from index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); while (mapi < old_num_entries) { if (zthr_iscancelled(zthr)) { zfs_dbgmsg("pausing condense of vdev %llu " "at index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); break; } vdev_indirect_mapping_entry_phys_t *entry = &old_mapping->vim_entries[mapi]; uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); ASSERT3U(obsolete_counts[mapi], <=, entry_size); if (obsolete_counts[mapi] < entry_size) { spa_condense_indirect_commit_entry(spa, entry, obsolete_counts[mapi]); /* * This delay may be requested for testing, debugging, * or performance reasons. */ hrtime_t now = gethrtime(); hrtime_t sleep_until = now + MSEC2NSEC( zfs_condense_indirect_commit_entry_delay_ms); zfs_sleep_until(sleep_until); } mapi++; } } /* ARGSUSED */ static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } /* ARGSUSED */ static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; uint64_t start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; space_map_t *prev_obsolete_sm = NULL; ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { /* * The list must start out empty in order for the * _commit_sync() sync task to be properly registered * on the first call to _commit_entry(); so it's wise * to double check and ensure we actually are starting * with empty lists. */ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, counts, prev_obsolete_sm); } space_map_close(prev_obsolete_sm); /* * Generate new mapping. Determine what index to continue from * based on the max offset that we've already written in the * new mapping. */ uint64_t max_offset = vdev_indirect_mapping_max_offset(sci->sci_new_mapping); if (max_offset == 0) { /* We haven't written anything to the new mapping yet. */ start_index = 0; } else { /* * Pick up from where we left off. _entry_for_offset() * returns a pointer into the vim_entries array. If * max_offset is greater than any of the mappings * contained in the table NULL will be returned and * that indicates we've exhausted our iteration of the * old_mapping. */ vdev_indirect_mapping_entry_phys_t *entry = vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, max_offset); if (entry == NULL) { /* * We've already written the whole new mapping. * This special value will cause us to skip the * generate_new_mapping step and just do the sync * task to complete the condense. */ start_index = UINT64_MAX; } else { start_index = entry - old_mapping->vim_entries; ASSERT3U(start_index, <, vdev_indirect_mapping_num_entries(old_mapping)); } } spa_condense_indirect_generate_new_mapping(vd, counts, start_index, zthr); vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); /* * If the zthr has received a cancellation signal while running * in generate_new_mapping() or at any point after that, then bail * early. We don't want to complete the condense if the spa is * shutting down. */ if (zthr_iscancelled(zthr)) return; VERIFY0(dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Sync task to begin the condensing process. */ void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; ASSERT0(scip->scip_next_mapping_object); ASSERT0(scip->scip_prev_obsolete_sm_object); ASSERT0(scip->scip_vdev); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); uint64_t obsolete_sm_obj; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); ASSERT3U(obsolete_sm_obj, !=, 0); scip->scip_vdev = vd->vdev_id; scip->scip_next_mapping_object = vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; /* * We don't need to allocate a new space map object, since * vdev_indirect_sync_obsolete will allocate one when needed. */ space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (*scip) / sizeof (uint64_t), scip, tx)); ASSERT3P(spa->spa_condensing_indirect, ==, NULL); spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " "posm=%llu nm=%llu", vd->vdev_id, dmu_tx_get_txg(tx), (u_longlong_t)scip->scip_prev_obsolete_sm_object, (u_longlong_t)scip->scip_next_mapping_object); zthr_wakeup(spa->spa_condense_zthr); } /* * Sync to the given vdev's obsolete space map any segments that are no longer * referenced as of the given txg. * * If the obsolete space map doesn't exist yet, create and open it. */ void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object == 0) { obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); ASSERT3U(obsolete_sm_object, !=, 0); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } int spa_condense_init(spa_t *spa) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), &spa->spa_condensing_indirect_phys); if (error == 0) { if (spa_writeable(spa)) { spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); } return (0); } else if (error == ENOENT) { return (0); } else { return (error); } } void spa_condense_fini(spa_t *spa) { if (spa->spa_condensing_indirect != NULL) { spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; } } void spa_start_indirect_condensing_thread(spa_t *spa) { ASSERT3P(spa->spa_condense_zthr, ==, NULL); spa->spa_condense_zthr = zthr_create("z_indirect_condense", spa_condense_indirect_thread_check, spa_condense_indirect_thread, spa); } /* * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj * will contain either the obsolete spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } /* * Gets the obsolete count are precise spacemap object from the vdev's ZAP. * On success are_precise will be set to reflect if the counts are precise. * All other errors are returned to the caller. */ int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *are_precise = B_FALSE; return (0); } uint64_t val = 0; int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); if (error == 0) { *are_precise = (val != 0); } else if (error == ENOENT) { *are_precise = B_FALSE; error = 0; } return (error); } /* ARGSUSED */ static void vdev_indirect_close(vdev_t *vd) { } /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; *logical_ashift = vd->vdev_ashift; *physical_ashift = vd->vdev_physical_ashift; return (0); } typedef struct remap_segment { vdev_t *rs_vd; uint64_t rs_offset; uint64_t rs_asize; uint64_t rs_split_offset; list_node_t rs_node; } remap_segment_t; static remap_segment_t * rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) { remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); rs->rs_vd = vd; rs->rs_offset = offset; rs->rs_asize = asize; rs->rs_split_offset = split_offset; return (rs); } /* * Given an indirect vdev and an extent on that vdev, it duplicates the * physical entries of the indirect mapping that correspond to the extent * to a new array and returns a pointer to it. In addition, copied_entries * is populated with the number of mapping entries that were duplicated. * * Note that the function assumes that the caller holds vdev_indirect_rwlock. * This ensures that the mapping won't change due to condensing as we * copy over its contents. * * Finally, since we are doing an allocation, it is up to the caller to * free the array allocated in this function. */ static vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t *copied_entries) { vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t entries = 0; ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); vdev_indirect_mapping_entry_phys_t *first_mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); ASSERT3P(first_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *m = first_mapping; while (asize > 0) { uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(asize, size - inner_offset); offset += inner_size; asize -= inner_size; entries++; m++; } size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); bcopy(first_mapping, duplicate_mappings, copy_length); *copied_entries = entries; return (duplicate_mappings); } /* * Goes through the relevant indirect mappings until it hits a concrete vdev * and issues the callback. On the way to the concrete vdev, if any other * indirect vdevs are encountered, then the callback will also be called on * each of those indirect vdevs. For example, if the segment is mapped to * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is * mapped to segment B on concrete vdev 2, then the callback will be called on * both vdev 1 and vdev 2. * * While the callback passed to vdev_indirect_remap() is called on every vdev * the function encounters, certain callbacks only care about concrete vdevs. * These types of callbacks should return immediately and explicitly when they * are called on an indirect vdev. * * Because there is a possibility that a DVA section in the indirect device * has been split into multiple sections in our mapping, we keep track * of the relevant contiguous segments of the new location (remap_segment_t) * in a stack. This way we can call the callback for each of the new sections * created by a single section of the indirect device. Note though, that in * this scenario the callbacks in each split block won't occur in-order in * terms of offset, so callers should not make any assumptions about that. * * For callbacks that don't handle split blocks and immediately return when * they encounter them (as is the case for remap_blkptr_cb), the caller can * assume that its callback will be applied from the first indirect vdev * encountered to the last one and then the concrete vdev, in that order. */ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) { list_t stack; spa_t *spa = vd->vdev_spa; list_create(&stack, sizeof (remap_segment_t), offsetof(remap_segment_t, rs_node)); for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); rs != NULL; rs = list_remove_head(&stack)) { vdev_t *v = rs->rs_vd; uint64_t num_entries = 0; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ASSERT(rs->rs_asize > 0); /* * Note: As this function can be called from open context * (e.g. zio_read()), we need the following rwlock to * prevent the mapping from being changed by condensing. * * So we grab the lock and we make a copy of the entries * that are relevant to the extent that we are working on. * Once that is done, we drop the lock and iterate over * our copy of the mapping. Once we are done with the with * the remap segment and we free it, we also free our copy * of the indirect mapping entries that are relevant to it. * * This way we don't need to wait until the function is * finished with a segment, to condense it. In addition, we * don't need a recursive rwlock for the case that a call to * vdev_indirect_remap() needs to call itself (through the * codepath of its callback) for the same vdev in the middle * of its execution. */ rw_enter(&v->vdev_indirect_rwlock, RW_READER); ASSERT3P(v->vdev_indirect_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *mapping = vdev_indirect_mapping_duplicate_adjacent_entries(v, rs->rs_offset, rs->rs_asize, &num_entries); ASSERT3P(mapping, !=, NULL); ASSERT3U(num_entries, >, 0); rw_exit(&v->vdev_indirect_rwlock); for (uint64_t i = 0; i < num_entries; i++) { /* * Note: the vdev_indirect_mapping can not change * while we are running. It only changes while the * removal is in progress, and then only from syncing * context. While a removal is in progress, this * function is only called for frees, which also only * happen from syncing context. */ vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; ASSERT3P(m, !=, NULL); ASSERT3U(rs->rs_asize, >, 0); uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); ASSERT3U(rs->rs_offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(rs->rs_offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); ASSERT3U(dst_vdev, !=, v->vdev_id); uint64_t inner_offset = rs->rs_offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(rs->rs_asize, size - inner_offset); vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); ASSERT3P(dst_v, !=, NULL); if (dst_v->vdev_ops == &vdev_indirect_ops) { list_insert_head(&stack, rs_alloc(dst_v, dst_offset + inner_offset, inner_size, rs->rs_split_offset)); } if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { /* * Note: This clause exists only solely for * testing purposes. We use it to ensure that * split blocks work and that the callbacks * using them yield the same result if issued * in reverse order. */ uint64_t inner_half = inner_size / 2; func(rs->rs_split_offset + inner_half, dst_v, dst_offset + inner_offset + inner_half, inner_half, arg); func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_half, arg); } else { func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_size, arg); } rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; } VERIFY0(rs->rs_asize); kmem_free(mapping, num_entries * sizeof (*mapping)); kmem_free(rs, sizeof (remap_segment_t)); } list_destroy(&stack); } static void vdev_indirect_child_io_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); abd_put(zio->io_abd); } /* * This is a callback for vdev_indirect_remap() which allocates an * indirect_split_t for each split segment and adds it to iv_splits. */ static void vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; int n = 1; if (vd->vdev_ops == &vdev_mirror_ops) n = vd->vdev_children; indirect_split_t *is = kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); is->is_children = n; is->is_size = size; is->is_split_offset = split_offset; is->is_target_offset = offset; is->is_vdev = vd; list_create(&is->is_unique_child, sizeof (indirect_child_t), offsetof(indirect_child_t, ic_node)); /* * Note that we only consider multiple copies of the data for * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even * though they use the same ops as mirror, because there's only one * "good" copy under the replacing/spare. */ if (vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < n; i++) { is->is_child[i].ic_vdev = vd->vdev_child[i]; list_link_init(&is->is_child[i].ic_node); } } else { is->is_child[0].ic_vdev = vd; } list_insert_tail(&iv->iv_splits, is); } static void vdev_indirect_read_split_done(zio_t *zio) { indirect_child_t *ic = zio->io_private; if (zio->io_error != 0) { /* * Clear ic_data to indicate that we do not have data for this * child. */ abd_free(ic->ic_data); ic->ic_data = NULL; } } /* * Issue reads for all copies (mirror children) of all splits. */ static void vdev_indirect_read_all(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (!vdev_readable(ic->ic_vdev)) continue; /* * Note, we may read from a child whose DTL * indicates that the data may not be present here. * While this might result in a few i/os that will * likely return incorrect data, it simplifies the * code since we can treat scrub and resilver * identically. (The incorrect data will be * detected and ignored when we verify the * checksum.) */ ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); ic->ic_duplicate = NULL; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_read_split_done, ic)); } } iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa __maybe_unused = zio->io_spa; indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); zio->io_vsd = iv; zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); /* * Note: this code can handle other kinds of writes, * but we don't expect them. */ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire * data, which will checksum the same as the original data. * Pass the BP down so that the child i/o can verify the * checksum, and try a different location if available * (e.g. on a mirror). * * While this special case could be handled the same as the * general (split block) case, doing it this way ensures * that the vast majority of blocks on indirect vdevs * (which are not split) are handled identically to blocks * on non-indirect vdevs. This allows us to be less strict * about performance in the general (but rare) case. */ ASSERT0(first->is_split_offset); ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, first->is_vdev, first->is_target_offset, abd_get_offset(zio->io_abd, 0), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } else { iv->iv_split_block = B_TRUE; if (zio->io_type == ZIO_TYPE_READ && zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { /* * Read all copies. Note that for simplicity, * we don't bother consulting the DTL in the * resilver case. */ vdev_indirect_read_all(zio); } else { /* * If this is a read zio, we read one copy of each * split segment, from the top-level vdev. Since * we don't know the checksum of each split * individually, the child zio can't ensure that * we get the right data. E.g. if it's a mirror, * it will just read from a random (healthy) leaf * vdev. We have to verify the checksum in * vdev_indirect_io_done(). * * For write zios, the vdev code will ensure we write * to all children. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, abd_get_offset(zio->io_abd, is->is_split_offset), is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } } } zio_execute(zio); } /* * Report a checksum error for a child. */ static void vdev_indirect_checksum_error(zio_t *zio, indirect_split_t *is, indirect_child_t *ic) { vdev_t *vd = ic->ic_vdev; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zio_bad_cksum_t zbc = {{{ 0 }}}; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; - zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); } /* * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data * with the good copy. Note that we do this without regard for the DTL's, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). */ static void vdev_indirect_repair(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; enum zio_flag flags = ZIO_FLAG_IO_REPAIR; if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) flags |= ZIO_FLAG_SELF_HEAL; if (!spa_writeable(zio->io_spa)) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, is->is_good_child->ic_data, is->is_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); vdev_indirect_checksum_error(zio, is, ic); } } } /* * Report checksum errors on all children that we read from. */ static void vdev_indirect_all_checksum_errors(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data == NULL) continue; vdev_t *vd = ic->ic_vdev; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, - is->is_target_offset, is->is_size, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, + NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); } } } /* * Copy data from all the splits to a main zio then validate the checksum. * If then checksum is successfully validated return success. */ static int vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) { zio_bad_cksum_t zbc; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { ASSERT3P(is->is_good_child->ic_data, !=, NULL); ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); abd_copy_off(zio->io_abd, is->is_good_child->ic_data, is->is_split_offset, 0, is->is_size); } return (zio_checksum_error(zio, &zbc)); } /* * There are relatively few possible combinations making it feasible to * deterministically check them all. We do this by setting the good_child * to the next unique split version. If we reach the end of the list then * "carry over" to the next unique split version (like counting in base * is_unique_children, but each digit can have a different base). */ static int vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) { boolean_t more = B_TRUE; iv->iv_attempts = 0; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) is->is_good_child = list_head(&is->is_unique_child); while (more == B_TRUE) { iv->iv_attempts++; more = B_FALSE; if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_good_child = list_next(&is->is_unique_child, is->is_good_child); if (is->is_good_child != NULL) { more = B_TRUE; break; } is->is_good_child = list_head(&is->is_unique_child); } } ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); return (SET_ERROR(ECKSUM)); } /* * There are too many combinations to try all of them in a reasonable amount * of time. So try a fixed number of random combinations from the unique * split versions, after which we'll consider the block unrecoverable. */ static int vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) { iv->iv_attempts = 0; while (iv->iv_attempts < iv->iv_attempts_max) { iv->iv_attempts++; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic = list_head(&is->is_unique_child); int children = is->is_unique_children; for (int i = spa_get_random(children); i > 0; i--) ic = list_next(&is->is_unique_child, ic); ASSERT3P(ic, !=, NULL); is->is_good_child = ic; } if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); } return (SET_ERROR(ECKSUM)); } /* * This is a validation function for reconstruction. It randomly selects * a good combination, if one can be found, and then it intentionally * damages all other segment copes by zeroing them. This forces the * reconstruction algorithm to locate the one remaining known good copy. */ static int vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) { int error; /* Presume all the copies are unique for initial selection. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (ic->ic_data != NULL) { is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic); } } if (list_is_empty(&is->is_unique_child)) { error = SET_ERROR(EIO); goto out; } } /* * Set each is_good_child to a randomly-selected child which * is known to contain validated data. */ error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error) goto out; /* * Damage all but the known good copy by zeroing it. This will * result in two or less unique copies per indirect_child_t. * Both may need to be checked in order to reconstruct the block. * Set iv->iv_attempts_max such that all unique combinations will * enumerated, but limit the damage to at most 12 indirect splits. */ iv->iv_attempts_max = 1; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; if (iv->iv_attempts_max >= (1ULL << 12)) { iv->iv_attempts_max = UINT64_MAX; break; } } out: /* Empty the unique children lists so they can be reconstructed. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); is->is_unique_children = 0; } return (error); } /* * This function is called when we have read all copies of the data and need * to try to find a combination of copies that gives us the right checksum. * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know * the checksum of each split segment individually. * * We have to try every unique combination of copies of split segments, until * we find one that checksums correctly. Duplicate segment copies are first * identified and latter skipped during reconstruction. This optimization * reduces the search space and ensures that of the remaining combinations * at most one is correct. * * When the total number of combinations is small they can all be checked. * For example, if we have 3 segments in the split, and each points to a * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] * ======|===================== * A | data_A_0 data_A_1 * B | data_B_0 data_B_1 * C | data_C_0 data_C_1 * * We will try the following (mirror children)^(number of splits) (2^3=8) * combinations, which is similar to bitwise-little-endian counting in * binary. In general each "digit" corresponds to a split segment, and the * base of each digit is is_children, which can be different for each * digit. * * "low bit" "high bit" * v v * data_A_0 data_B_0 data_C_0 * data_A_1 data_B_0 data_C_0 * data_A_0 data_B_1 data_C_0 * data_A_1 data_B_1 data_C_0 * data_A_0 data_B_0 data_C_1 * data_A_1 data_B_0 data_C_1 * data_A_0 data_B_1 data_C_1 * data_A_1 data_B_1 data_C_1 * * Note that the split segments may be on the same or different top-level * vdevs. In either case, we may need to try lots of combinations (see * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror * has small silent errors on all of its children, we can still reconstruct * the correct data, as long as those errors are at sufficiently-separated * offsets (specifically, separated by the largest block size - default of * 128KB, but up to 16MB). */ static void vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; boolean_t known_good = B_FALSE; int error; iv->iv_unique_combinations = 1; iv->iv_attempts_max = UINT64_MAX; if (zfs_reconstruct_indirect_combinations_max > 0) iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; /* * If nonzero, every 1/x blocks will be damaged, in order to validate * reconstruction when there are split segments with damaged copies. * Known_good will be TRUE when reconstruction is known to be possible. */ if (zfs_reconstruct_indirect_damage_fraction != 0 && spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) known_good = (vdev_indirect_splits_damage(iv, zio) == 0); /* * Determine the unique children for a split segment and add them * to the is_unique_child list. By restricting reconstruction * to these children, only unique combinations will be considered. * This can vastly reduce the search space when there are a large * number of indirect splits. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic_i = &is->is_child[i]; if (ic_i->ic_data == NULL || ic_i->ic_duplicate != NULL) continue; for (int j = i + 1; j < is->is_children; j++) { indirect_child_t *ic_j = &is->is_child[j]; if (ic_j->ic_data == NULL || ic_j->ic_duplicate != NULL) continue; if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) ic_j->ic_duplicate = ic_i; } is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic_i); } /* Reconstruction is impossible, no valid children */ EQUIV(list_is_empty(&is->is_unique_child), is->is_unique_children == 0); if (list_is_empty(&is->is_unique_child)) { zio->io_error = EIO; vdev_indirect_all_checksum_errors(zio); zio_checksum_verified(zio); return; } iv->iv_unique_combinations *= is->is_unique_children; } if (iv->iv_unique_combinations <= iv->iv_attempts_max) error = vdev_indirect_splits_enumerate_all(iv, zio); else error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error != 0) { /* All attempted combinations failed. */ ASSERT3B(known_good, ==, B_FALSE); zio->io_error = error; vdev_indirect_all_checksum_errors(zio); } else { /* * The checksum has been successfully validated. Issue * repair I/Os to any copies of splits which don't match * the validated version. */ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); vdev_indirect_repair(zio); zio_checksum_verified(zio); } } static void vdev_indirect_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (iv->iv_reconstruct) { /* * We have read all copies of the data (e.g. from mirrors), * either because this was a scrub/resilver, or because the * one-copy read didn't checksum correctly. */ vdev_indirect_reconstruct_io_done(zio); return; } if (!iv->iv_split_block) { /* * This was not a split block, so we passed the BP down, * and the checksum was handled by the (one) child zio. */ return; } zio_bad_cksum_t zbc; int ret = zio_checksum_error(zio, &zbc); if (ret == 0) { zio_checksum_verified(zio); return; } /* * The checksum didn't match. Read all copies of all splits, and * then we will try to reconstruct. The next time * vdev_indirect_io_done() is called, iv_reconstruct will be set. */ vdev_indirect_read_all(zio); zio_vdev_io_redone(zio); } vdev_ops_t vdev_indirect_ops = { .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; EXPORT_SYMBOL(spa_condense_fini); EXPORT_SYMBOL(spa_start_indirect_condensing_thread); EXPORT_SYMBOL(spa_condense_indirect_start_sync); EXPORT_SYMBOL(spa_condense_init); EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_should_condense); EXPORT_SYMBOL(vdev_indirect_sync_obsolete); EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW, "Used by tests to ensure certain actions happen in the middle of a " "condense. A maximum value of 1 should be sufficient."); ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/vdev_raidz.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/vdev_raidz.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/vdev_raidz.c (revision 365340) @@ -1,2421 +1,2421 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ #endif /* * Virtual device vector for RAID-Z. * * This vdev supports single, double, and triple parity. For single parity, * we use a simple XOR of all the data columns. For double or triple parity, * we use a special case of Reed-Solomon coding. This extends the * technique described in "The mathematics of RAID-6" by H. Peter Anvin by * drawing on the system described in "A Tutorial on Reed-Solomon Coding for * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the * former is also based. The latter is designed to provide higher performance * for writes. * * Note that the Plank paper claimed to support arbitrary N+M, but was then * amended six years later identifying a critical flaw that invalidates its * claims. Nevertheless, the technique can be adapted to work for up to * triple parity. For additional parity, the amendment "Note: Correction to * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding * is viable, but the additional complexity means that write performance will * suffer. * * All of the methods above operate on a Galois field, defined over the * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements * can be expressed with a single byte. Briefly, the operations on the * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: * * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 * (A * 2)_4 = A_3 + A_7 * (A * 2)_3 = A_2 + A_7 * (A * 2)_2 = A_1 + A_7 * (A * 2)_1 = A_0 * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). * As an aside, this multiplication is derived from the error correcting * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather * than field addition). The inverse of a field element A (A^-1) is therefore * A ^ (255 - 1) = A^254. * * The up-to-three parity columns, P, Q, R over several data columns, * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. (There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) * * See the reconstruction code below for how P, Q and R can used individually * or in concert to recover missing data columns. */ #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a * 64-bit value all at once rather than a byte at a time. This works by * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ #define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } void vdev_raidz_map_free(raidz_map_t *rm) { int c; for (c = 0; c < rm->rm_firstdatacol; c++) { abd_free(rm->rm_col[c].rc_abd); if (rm->rm_col[c].rc_gdata != NULL) abd_free(rm->rm_col[c].rc_gdata); } for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) abd_put(rm->rm_col[c].rc_abd); if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } static void vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); rm->rm_freed = 1; if (rm->rm_reports == 0) vdev_raidz_map_free(rm); } /*ARGSUSED*/ static void vdev_raidz_cksum_free(void *arg, size_t ignored) { raidz_map_t *rm = arg; ASSERT3U(rm->rm_reports, >, 0); if (--rm->rm_reports == 0 && rm->rm_freed != 0) vdev_raidz_map_free(rm); } static void vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) { raidz_map_t *rm = zcr->zcr_cbdata; const size_t c = zcr->zcr_cbinfo; size_t x, offset; const abd_t *good = NULL; const abd_t *bad = rm->rm_col[c].rc_abd; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } if (c < rm->rm_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; /* * Set up the rm_col[]s to generate the parity for * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. */ for (x = 0; x < rm->rm_firstdatacol; x++) { bad_parity[x] = rm->rm_col[x].rc_abd; rm->rm_col[x].rc_abd = rm->rm_col[x].rc_gdata = abd_alloc_sametype(rm->rm_col[x].rc_abd, rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ offset = 0; for (; x < rm->rm_cols; x++) { abd_put(rm->rm_col[x].rc_abd); rm->rm_col[x].rc_abd = abd_get_offset_size((abd_t *)good_data, offset, rm->rm_col[x].rc_size); offset += rm->rm_col[x].rc_size; } /* * Construct the parity from the good data. */ vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ for (x = 0; x < rm->rm_firstdatacol; x++) rm->rm_col[x].rc_abd = bad_parity[x]; offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { abd_put(rm->rm_col[x].rc_abd); rm->rm_col[x].rc_abd = abd_get_offset_size( rm->rm_abd_copy, offset, rm->rm_col[x].rc_size); offset += rm->rm_col[x].rc_size; } } ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, rm->rm_col[c].rc_size); } else { /* adjust good_data to point at the start of our column */ offset = 0; for (x = rm->rm_firstdatacol; x < c; x++) offset += rm->rm_col[x].rc_size; good = abd_get_offset_size((abd_t *)good_data, offset, rm->rm_col[c].rc_size); } /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); abd_put((abd_t *)good); } /* * Invoked indirectly by zfs_ereport_start_checksum(), called * below when our read operation fails completely. The main point * is to keep a copy of everything we read from disk, so that at * vdev_raidz_cksum_finish() time we can compare it with the good data. */ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; zcr->zcr_cbinfo = c; zcr->zcr_finish = vdev_raidz_cksum_finish; zcr->zcr_free = vdev_raidz_cksum_free; rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); if (rm->rm_abd_copy != NULL) return; /* * It's the first time we're called for this raidz_map_t, so we need * to copy the data aside; there's no guarantee that our zio's buffer * won't be re-used for something else. * * Our parity data is already in separate buffers, so there's no need * to copy them. */ size = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, col->rc_size); abd_copy(tmp, col->rc_abd, col->rc_size); abd_put(col->rc_abd); col->rc_abd = tmp; offset += col->rc_size; } ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, .vsd_cksum_report = vdev_raidz_cksum_report }; /* * Divides the IO evenly across all child vdevs; usually, dcols is * the number of children in the target vdev. * * Avoid inlining the function to keep vdev_raidz_io_start(), which * is this functions only caller, as small as possible on the stack. */ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* acols: The columns that will be accessed. */ /* scols: The columns that will be accessed or skipped. */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; scols = dcols; } ASSERT3U(acols, <=, scols); rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); rm->rm_cols = acols; rm->rm_scols = scols; rm->rm_bigcols = bc; rm->rm_skipstart = bc; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; rm->rm_abd_copy = NULL; rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; asize = 0; for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; rm->rm_col[c].rc_abd = NULL; rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; if (c >= acols) rm->rm_col[c].rc_size = 0; else if (c < bc) rm->rm_col[c].rc_size = (q + 1) << ashift; else rm->rm_col[c].rc_size = q << ashift; asize += rm->rm_col[c].rc_size; } ASSERT3U(asize, ==, tot << ashift); rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_abd = abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, rm->rm_col[c].rc_size); off = rm->rm_col[c].rc_size; for (c = c + 1; c < acols; c++) { rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, rm->rm_col[c].rc_size); off += rm->rm_col[c].rc_size; } /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read * during normal operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices evenly, we * won't see any benefit. Further, occasional writes that aren't a * multiple of the LCM of the number of children and the minimum * stripe width are sufficient to avoid pessimal behavior. * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for padding * we must make sure to note this swap. We will never intend to * skip the first column since at least one data and one parity * column must appear in each row. */ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_devidx = devidx; rm->rm_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } struct pqr_struct { uint64_t *p; uint64_t *q; uint64_t *r; }; static int vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; int i, cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); for (i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); } static int vdev_raidz_pq_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int i, cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; } return (0); } static int vdev_raidz_pqr_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int i, cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; VDEV_RAIDZ_64MUL_4(*pqr->r, mask); *pqr->r ^= *src; } return (0); } static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { uint64_t *p; int c; abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_abd; p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { uint64_t *p, *q, pcnt, ccnt, mask, i; int c; abd_t *src; pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_abd; p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); (void) memcpy(q, p, rm->rm_col[c].rc_size); for (i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } } else { struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ for (i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } } } static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; abd_t *src; pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_R].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_abd; p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); (void) memcpy(q, p, rm->rm_col[c].rc_size); (void) memcpy(r, p, rm->rm_col[c].rc_size); for (i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; } } else { struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ for (i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } } } } /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. */ void vdev_raidz_generate_parity(raidz_map_t *rm) { /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) return; switch (rm->rm_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rm); break; case 2: vdev_raidz_generate_parity_pq(rm); break; case 3: vdev_raidz_generate_parity_pqr(rm); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } /* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) { uint64_t *dst = dbuf; uint64_t *src = sbuf; int cnt = size / sizeof (src[0]); for (int i = 0; i < cnt; i++) { dst[i] ^= src[i]; } return (0); } /* ARGSUSED */ static int vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, void *private) { uint64_t *dst = dbuf; uint64_t *src = sbuf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, src++) { VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } return (0); } /* ARGSUSED */ static int vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) { uint64_t *dst = buf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++) { /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ VDEV_RAIDZ_64MUL_2(*dst, mask); } return (0); } struct reconst_q_struct { uint64_t *q; int exp; }; static int vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) { struct reconst_q_struct *rq = private; uint64_t *dst = buf; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, rq->q++) { int j; uint8_t *b; *dst ^= *rq->q; for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { *b = vdev_raidz_exp2(*b, rq->exp); } } return (0); } struct reconst_pq_struct { uint8_t *p; uint8_t *q; uint8_t *pxy; uint8_t *qxy; int aexp; int bexp; }; static int vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; uint8_t *yd = ybuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); *yd = *rpq->p ^ *rpq->pxy ^ *xd; } return (0); } static int vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { /* same operation as vdev_raidz_reconst_pq_func() on xd */ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); } return (0); } static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { int x = tgts[0]; int c; abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); ASSERT(rm->rm_col[x].rc_size > 0); src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; dst = rm->rm_col[x].rc_abd; abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { uint64_t size = MIN(rm->rm_col[x].rc_size, rm->rm_col[c].rc_size); src = rm->rm_col[c].rc_abd; dst = rm->rm_col[x].rc_abd; if (c == x) continue; (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); } static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, rm->rm_col[c].rc_size); src = rm->rm_col[c].rc_abd; dst = rm->rm_col[x].rc_abd; if (c == rm->rm_firstdatacol) { abd_copy(dst, src, size); if (rm->rm_col[x].rc_size > size) abd_zero_off(dst, size, rm->rm_col[x].rc_size - size); } else { ASSERT3U(size, <=, rm->rm_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, size, rm->rm_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; dst = rm->rm_col[x].rc_abd; exp = 255 - (rm->rm_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; abd_t *xd, *yd; ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rm->rm_firstdatacol); ASSERT(y < rm->rm_cols); ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as * though columns x and y were full of zeros -- Pxy and Qxy. We want to * reuse the parity generation mechanism without trashing the actual * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. */ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; xsize = rm->rm_col[x].rc_size; ysize = rm->rm_col[y].rc_size; rm->rm_col[VDEV_RAIDZ_P].rc_abd = abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); rm->rm_col[VDEV_RAIDZ_Q].rc_abd = abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; vdev_raidz_generate_parity_pq(rm); rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); xd = rm->rm_col[x].rc_abd; yd = rm->rm_col[y].rc_abd; /* * We now have: * Pxy = P + D_x + D_y * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y * * We can then solve for D_x: * D_x = A * (P + Pxy) + B * (Q + Qxy) * where * A = 2^(x - y) * (2^(x - y) + 1)^-1 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 * * With D_x in hand, we can easily solve for D_y: * D_y = P + Pxy + D_x */ a = vdev_raidz_pow2[255 + x - y]; b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; ASSERT3U(xsize, >=, ysize); struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; (void) abd_iterate_func2(xd, yd, 0, 0, ysize, vdev_raidz_reconst_pq_func, &rpq); (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } /* BEGIN CSTYLED */ /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coefficients used to generate parity as well as * the contents of the data and parity disks. This can be expressed with * vectors for the original data (D) and the actual data (d) and parity (p) * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): * * __ __ __ __ * | | __ __ | p_0 | * | V | | D_0 | | p_m-1 | * | | x | : | = | d_0 | * | I | | D_n-1 | | : | * | | ~~ ~~ | d_n-1 | * ~~ ~~ ~~ ~~ * * I is simply a square identity matrix of size n, and V is a vandermonde * matrix defined by the coefficients we chose for the various parity columns * (1, 2, 4). Note that these values were chosen both for simplicity, speedy * computation as well as linear separability. * * __ __ __ __ * | 1 .. 1 1 1 | | p_0 | * | 2^n-1 .. 4 2 1 | __ __ | : | * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | * | 1 .. 0 0 0 | | D_1 | | d_0 | * | 0 .. 0 0 0 | x | D_2 | = | d_1 | * | : : : : | | : | | d_2 | * | 0 .. 1 0 0 | | D_n-1 | | : | * | 0 .. 0 1 0 | ~~ ~~ | : | * | 0 .. 0 0 1 | | d_n-1 | * ~~ ~~ ~~ ~~ * * Note that I, V, d, and p are known. To compute D, we must invert the * matrix and use the known data and parity values to reconstruct the unknown * data values. We begin by removing the rows in V|I and d|p that correspond * to failed or missing columns; we then make V|I square (n x n) and d|p * sized n by removing rows corresponding to unused parity from the bottom up * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' * using Gauss-Jordan elimination. In the example below we use m=3 parity * columns, n=8 data columns, with errors in d_1, d_2, and p_1: * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks * | 19 205 116 29 64 16 4 1 | / / * | 1 0 0 0 0 0 0 0 | / / * | 0 1 0 0 0 0 0 0 | <--' / * (V|I) = | 0 0 1 0 0 0 0 0 | <---' * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 | * (V|I)' = | 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We * have carefully chosen the seed values 1, 2, and 4 to ensure that this * matrix is not singular. * __ __ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 0 0 1 0 0 0 0 0 | * | 167 100 5 41 159 169 217 208 | * | 166 100 4 40 158 168 216 209 | * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values * of the missing data. * * As is apparent from the example above, the only non-trivial rows in the * inverse matrix correspond to the data disks that we're trying to * reconstruct. Indeed, those are the only rows we need as the others would * only be useful for reconstructing data known or assumed to be valid. For * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ /* END CSTYLED */ static void vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); /* * Fill in the missing rows of interest. */ for (i = 0; i < nmap; i++) { ASSERT3S(0, <=, map[i]); ASSERT3S(map[i], <=, 2); pow = map[i] * n; if (pow > 255) pow -= 255; ASSERT(pow <= 255); for (j = 0; j < n; j++) { pow -= map[i]; if (pow < 0) pow += 255; rows[i][j] = vdev_raidz_pow2[pow]; } } } static void vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; uint8_t log; /* * Assert that the first nmissing entries from the array of used * columns correspond to parity columns and that subsequent entries * correspond to data columns. */ for (i = 0; i < nmissing; i++) { ASSERT3S(used[i], <, rm->rm_firstdatacol); } for (; i < n; i++) { ASSERT3S(used[i], >=, rm->rm_firstdatacol); } /* * First initialize the storage where we'll compute the inverse rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { invrows[i][j] = (i == j) ? 1 : 0; } } /* * Subtract all trivial rows from the rows of consequence. */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { ASSERT3U(used[j], >=, rm->rm_firstdatacol); jj = used[j] - rm->rm_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; } } /* * For each of the rows of interest, we must normalize it and subtract * a multiple of it from the other rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < missing[i]; j++) { ASSERT0(rows[i][j]); } ASSERT3U(rows[i][missing[i]], !=, 0); /* * Compute the inverse of the first element and multiply each * element in the row by that value. */ log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; for (j = 0; j < n; j++) { rows[i][j] = vdev_raidz_exp2(rows[i][j], log); invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); } for (ii = 0; ii < nmissing; ii++) { if (i == ii) continue; ASSERT3U(rows[ii][missing[i]], !=, 0); log = vdev_raidz_log2[rows[ii][missing[i]]]; for (j = 0; j < n; j++) { rows[ii][j] ^= vdev_raidz_exp2(rows[i][j], log); invrows[ii][j] ^= vdev_raidz_exp2(invrows[i][j], log); } } } /* * Verify that the data that is left in the rows are properly part of * an identity matrix. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { if (j == missing[i]) { ASSERT3U(rows[i][j], ==, 1); } else { ASSERT0(rows[i][j]); } } } } static void vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; uint8_t *src; uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; uint8_t log = 0; uint8_t val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; psize = sizeof (invlog[0][0]) * n * nmissing; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing; i++) { invlog[i] = pp; pp += n; } for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { ASSERT3U(invrows[i][j], !=, 0); invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; } } for (i = 0; i < n; i++) { c = used[i]; ASSERT3U(c, <, rm->rm_cols); src = abd_to_buf(rm->rm_col[c].rc_abd); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; ASSERT3U(cc, >=, rm->rm_firstdatacol); ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); dcount[j] = rm->rm_col[cc].rc_size; } ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; for (cc = 0; cc < nmissing; cc++) { if (x >= dcount[cc]) continue; if (*src == 0) { val = 0; } else { if ((ll = log + invlog[cc][i]) >= 255) ll -= 255; val = vdev_raidz_pow2[ll]; } if (i == 0) dst[cc][x] = val; else dst[cc][x] ^= val; } } } kmem_free(p, psize); } static int vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; abd_t **bufs = NULL; int code = 0; /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs. */ if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; bufs[c] = col->rc_abd; col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); abd_copy(col->rc_abd, bufs[c], col->rc_size); } } n = rm->rm_cols - rm->rm_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { if (tgts[t] >= rm->rm_firstdatacol) { missing_rows[nmissing_rows++] = tgts[t] - rm->rm_firstdatacol; } } /* * Figure out which parity columns to use to help generate the missing * data columns. */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); ASSERT(c < rm->rm_firstdatacol); /* * Skip any targeted parity columns. */ if (c == tgts[tt]) { tt++; continue; } code |= 1 << c; parity_map[i] = c; i++; } ASSERT(code != 0); ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing_rows; i++) { rows[i] = pp; pp += n; invrows[i] = pp; pp += n; } used = pp; for (i = 0; i < nmissing_rows; i++) { used[i] = parity_map[i]; } for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { if (tt < nmissing_rows && c == missing_rows[tt] + rm->rm_firstdatacol) { tt++; continue; } ASSERT3S(i, <, n); used[i] = c; i++; } /* * Initialize the interesting rows of the matrix. */ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); /* * copy back from temporary linear abds and free them */ if (bufs) { for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; abd_copy(bufs[c], col->rc_abd, col->rc_size); abd_free(col->rc_abd); col->rc_abd = bufs[c]; } kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); } return (code); } int vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; int i, c, ret; int code; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; /* * The tgts list must already be sorted. */ for (i = 1; i < nt; i++) { ASSERT(t[i] > t[i - 1]); } nbadparity = rm->rm_firstdatacol; nbaddata = rm->rm_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rm->rm_cols; c++) { if (c < rm->rm_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; } else if (rm->rm_col[c].rc_error != 0) { tgts[ntgts++] = c; } else if (c >= rm->rm_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; nbadparity--; } } ASSERT(ntgts >= nt); ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return (ret); /* * See if we can use any of our optimized reconstruction routines. */ switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) return (vdev_raidz_reconstruct_p(rm, dt, 1)); ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) return (vdev_raidz_reconstruct_q(rm, dt, 1)); ASSERT(rm->rm_firstdatacol > 2); break; case 2: ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) return (vdev_raidz_reconstruct_pq(rm, dt, 2)); ASSERT(rm->rm_firstdatacol > 2); break; } code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); } static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; int c; int lasterror = 0; int numerrors = 0; ASSERT(nparity > 0); if (nparity > VDEV_RAIDZ_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); *physical_ashift = MAX(*physical_ashift, cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; *max_asize *= vd->vdev_children; if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_raidz_close(vdev_t *vd) { int c; for (c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t cols = vd->vdev_children; uint64_t nparity = vd->vdev_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; return (asize); } static void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static void vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) { #ifdef ZFS_DEBUG vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = zio->io_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, zio->io_size); raidz_col_t *rc = &rm->rm_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* * It would be nice to assert that rs_end is equal * to rc_offset + rc_size but there might be an * optional I/O at the end that is not accounted in * rc_size. */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size + (1 << tvd->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } #endif } /* * Start an IO operation on a RAIDZ VDev * * Outline: * - For write operations: * 1. Generate the parity data * 2. Create child zio write operations to each column's vdev, for both * data and parity. * 3. If the column skips any sectors for padding, create optional dummy * write zio children for those areas to improve aggregation continuity. * - For read operations: * 1. Create child zio read operations to each data column's vdev to read * the range of data required for zio. * 2. If this is a scrub or resilver operation, or if any of the data * vdevs have had errors, then create zio read operations to the parity * columns' VDevs as well. */ static void vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; raidz_map_t *rm; raidz_col_t *rc; int c, i; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; /* * Verify physical to logical translation. */ vdev_raidz_io_verify(zio, rm, c); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } /* * Generate optional I/Os for any skipped sectors to improve * aggregation contiguity. */ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { ASSERT(c <= rm->rm_scols); if (c == rm->rm_scols) c = 0; rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset + rc->rc_size, NULL, 1 << tvd->vdev_ashift, zio->io_type, zio->io_priority, ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } zio_execute(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else rm->rm_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else rm->rm_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } zio_execute(zio); } /* * Report a checksum error for a child of a RAID-Z device. */ static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - zfs_ereport_post_checksum(zio->io_spa, vd, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); } } /* * We keep track of whether or not there were any injected errors, so that * any ereports we generate can note it. */ static int raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; bzero(&zbc, sizeof (zio_bad_cksum_t)); int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; return (ret); } /* * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the * number such failures. */ static int raidz_parity_verify(zio_t *zio, raidz_map_t *rm) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); abd_copy(orig[c], rc->rc_abd, rc->rc_size); } vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } abd_free(orig[c]); } return (ret); } static int vdev_raidz_worst_error(raidz_map_t *rm) { int error = 0; for (int c = 0; c < rm->rm_cols; c++) error = zio_worst_error(error, rm->rm_col[c].rc_error); return (error); } /* * Iterate over all combinations of bad data and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into * account how reconstruction is actually performed. For example, with * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. */ static int vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) { raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *tgts = &tstore[1]; int curr, next, i, c, n; int code, ret = 0; ASSERT(total_errors < rm->rm_firstdatacol); /* * This simplifies one edge condition. */ tgts[-1] = -1; for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { /* * Initialize the targets array by finding the first n columns * that contain no error. * * If there were no data errors, we need to ensure that we're * always explicitly attempting to reconstruct at least one * data column. To do this, we simply push the highest target * up into the data columns. */ for (c = 0, i = 0; i < n; i++) { if (i == n - 1 && data_errors == 0 && c < rm->rm_firstdatacol) { c = rm->rm_firstdatacol; } while (rm->rm_col[c].rc_error != 0) { c++; ASSERT3S(c, <, rm->rm_cols); } tgts[i] = c++; } /* * Setting tgts[n] simplifies the other edge condition. */ tgts[n] = rm->rm_cols; /* * These buffers were allocated in previous iterations. */ for (i = 0; i < n - 1; i++) { ASSERT(orig[i] != NULL); } orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, rm->rm_col[0].rc_size); curr = 0; next = tgts[curr]; while (curr != n) { tgts[curr] = next; curr = 0; /* * Save off the original data that we're going to * attempt to reconstruct. */ for (i = 0; i < n; i++) { ASSERT(orig[i] != NULL); c = tgts[i]; ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; abd_copy(orig[i], rc->rc_abd, rc->rc_size); } /* * Attempt a reconstruction and exit the outer loop on * success. */ code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(zio) == 0) { for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; ASSERT(rc->rc_error == 0); if (rc->rc_tried) raidz_checksum_error(zio, rc, orig[i]); rc->rc_error = SET_ERROR(ECKSUM); } ret = code; goto done; } /* * Restore the original data. */ for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; abd_copy(rc->rc_abd, orig[i], rc->rc_size); } do { /* * Find the next valid column after the curr * position.. */ for (next = tgts[curr] + 1; next < rm->rm_cols && rm->rm_col[next].rc_error != 0; next++) continue; ASSERT(next <= tgts[curr + 1]); /* * If that spot is available, we're done here. */ if (next != tgts[curr + 1]) break; /* * Otherwise, find the next valid column after * the previous position. */ for (c = tgts[curr - 1] + 1; rm->rm_col[c].rc_error != 0; c++) continue; tgts[curr] = c; curr++; } while (curr != n); } } n--; done: for (i = 0; i < n; i++) abd_free(orig[i]); return (ret); } /* * Complete an IO operation on a RAIDZ VDev * * Outline: * - For write operations: * 1. Check for errors on the child IOs. * 2. Return, setting an error code if too few child VDevs were written * to reconstruct the data later. Note that partial writes are * considered successful if they can be reconstructed at all. * - For read operations: * 1. Check for errors on the child IOs. * 2. If data errors occurred: * a. Try to reassemble the data from the parity available. * b. If we haven't yet read the parity drives, read them now. * c. If all parity drives have been read but the data still doesn't * reassemble with a correct checksum, then try combinatorial * reconstruction. * d. If that doesn't work, return an error. * 3. If there were unexpected errors or this is a resilver operation, * rewrite the vdevs that had errors. */ static void vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc = NULL; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; int n, c; int tgts[VDEV_RAIDZ_MAXPARITY]; int code; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ if (c < rm->rm_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; total_errors++; } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { parity_untried++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as a success. * (If we couldn't write enough columns to reconstruct * the data, the I/O failed. Otherwise, good enough.) * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ /* XXPOLICY */ if (total_errors > rm->rm_firstdatacol) zio->io_error = vdev_raidz_worst_error(rm); return; } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * There are three potential phases for a read: * 1. produce valid data from the columns read * 2. read all disks and try again * 3. perform combinatorial reconstruction * * Each phase is progressively both more expensive and less likely to * occur. If we encounter more errors than we can repair or all phases * fail, we have no choice but to return an error. */ /* * If the number of errors we saw was correctable -- less than or equal * to the number of parity disks read -- attempt to produce data that * has a valid checksum. Naturally, this case applies in the absence of * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { if (data_errors == 0) { if (raidz_checksum_verify(zio) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was * needed) regenerate and verify the parity. * We also regenerate parity when resilvering * so we can write it out to the failed device * later. */ if (parity_errors + parity_untried < rm->rm_firstdatacol || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; ASSERT(parity_errors + n <= rm->rm_firstdatacol); } goto done; } } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we * wouldn't be here in the correctable case. There must * also have been fewer parity errors than parity * columns or, again, we wouldn't be in this code path. */ ASSERT(parity_untried == 0); ASSERT(parity_errors < rm->rm_firstdatacol); /* * Identify the data columns that reported an error. */ n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_error != 0) { ASSERT(n < VDEV_RAIDZ_MAXPARITY); tgts[n++] = c; } } ASSERT(rm->rm_firstdatacol >= n); code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(zio) == 0) { /* * If we read more parity disks than were used * for reconstruction, confirm that the other * parity disks produced correct data. This * routine is suboptimal in that it regenerates * the parity that we already used in addition * to the parity that we're attempting to * verify, but this should be a relatively * uncommon case, and can be optimized if it * becomes a problem. Note that we regenerate * parity when resilvering so we can write it * out to failed devices later. */ if (parity_errors < rm->rm_firstdatacol - n || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; ASSERT(parity_errors + n <= rm->rm_firstdatacol); } goto done; } } } /* * This isn't a typical situation -- either we got a read error or * a child silently returned bad data. Read every block so we can * try again with as much data and parity as we can track down. If * we've already been through once before, all children will be marked * as tried so we'll proceed to combinatorial reconstruction. */ unexpected_errors = 1; rm->rm_missingdata = 0; rm->rm_missingparity = 0; for (c = 0; c < rm->rm_cols; c++) { if (rm->rm_col[c].rc_tried) continue; zio_vdev_io_redone(zio); do { rc = &rm->rm_col[c]; if (rc->rc_tried) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); return; } /* * At this point we've attempted to reconstruct the data given the * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting * in absent data. We check if there is enough additional data to * possibly reconstruct the data and then perform combinatorial * reconstruction over all possible combinations. If that fails, * we're cooked. */ if (total_errors > rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); } else if (total_errors < rm->rm_firstdatacol && (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { /* * If we didn't use all the available parity for the * combinatorial reconstruction, verify that the remaining * parity is correct. */ if (code != (1 << rm->rm_firstdatacol) - 1) (void) raidz_parity_verify(zio, rm); } else { /* * We're here because either: * * total_errors == rm_first_datacol, or * vdev_raidz_combrec() failed * * In either case, there is enough bad data to prevent * reconstruction. * * Start checksum ereports for all children which haven't * failed, and the IO wasn't speculative. */ zio->io_error = SET_ERROR(ECKSUM); if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { for (c = 0; c < rm->rm_cols; c++) { vdev_t *cvd; rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_error == 0) { zio_bad_cksum_t zbc; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); zfs_ereport_start_checksum( zio->io_spa, cvd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, (void *)(uintptr_t)c, &zbc); } } } } done: zio_checksum_verified(zio); if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_error == 0) continue; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted > vd->vdev_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) { uint64_t dcols = vd->vdev_children; uint64_t nparity = vd->vdev_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; if (s + nparity >= dcols) return (B_TRUE); for (uint64_t c = 0; c < s + nparity; c++) { uint64_t devidx = (f + c) % dcols; vdev_t *cvd = vd->vdev_child[devidx]; /* * dsl_scan_need_resilver() already checked vd with * vdev_dtl_contains(). So here just check cvd with * vdev_dtl_empty(), cheaper and a good approximation. */ if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) return (B_TRUE); } return (B_FALSE); } static void vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); uint64_t width = raidvd->vdev_children; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ ASSERT0(in->rs_start % (1 << ashift)); ASSERT0(in->rs_end % (1 << ashift)); uint64_t b_start = in->rs_start >> ashift; uint64_t b_end = in->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ start_row = ((b_start - tgt_col - 1) / width) + 1; uint64_t end_row = 0; if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; res->rs_start = start_row << ashift; res->rs_end = end_row << ashift; ASSERT3U(res->rs_start, <=, in->rs_start); ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); } vdev_ops_t vdev_raidz_ops = { .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, .vdev_op_asize = vdev_raidz_asize, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, .vdev_op_need_resilver = vdev_raidz_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; Index: vendor-sys/openzfs/dist/module/zfs/zfs_fm.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/zfs_fm.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/zfs_fm.c (revision 365340) @@ -1,1083 +1,1083 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This general routine is responsible for generating all the different ZFS * ereports. The payload is dependent on the class, and which arguments are * supplied to the function: * * EREPORT POOL VDEV IO * block X X X * data X X * device X X * pool X * * If we are in a loading state, all errors are chained together by the same * SPA-wide ENA (Error Numeric Association). * * For isolated I/O requests, we get the ENA from the zio_t. The propagation * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want * to chain together all ereports associated with a logical piece of data. For * read I/Os, there are basically three 'types' of I/O, which form a roughly * layered diagram: * * +---------------+ * | Aggregate I/O | No associated logical data or device * +---------------+ * | * V * +---------------+ Reads associated with a piece of logical data. * | Read I/O | This includes reads on behalf of RAID-Z, * +---------------+ mirrors, gang blocks, retries, etc. * | * V * +---------------+ Reads associated with a particular device, but * | Physical I/O | no logical data. Issued as part of vdev caching * +---------------+ and I/O aggregation. * * Note that 'physical I/O' here is not the same terminology as used in the rest * of ZIO. Typically, 'physical I/O' simply means that there is no attached * blockpointer. But I/O with no associated block pointer can still be related * to a logical piece of data (i.e. RAID-Z requests). * * Purely physical I/O always have unique ENAs. They are not related to a * particular piece of logical data, and therefore cannot be chained together. * We still generate an ereport, but the DE doesn't correlate it with any * logical piece of data. When such an I/O fails, the delegated I/O requests * will issue a retry, which will trigger the 'real' ereport with the correct * ENA. * * We keep track of the ENA for a ZIO chain through the 'io_logical' member. * When a new logical I/O is issued, we set this to point to itself. Child I/Os * then inherit this pointer, so that when it is first set subsequent failures * will use the same ENA. For vdev cache fill and queue aggregation I/O, * this pointer is set to NULL, and no ereport will be generated (since it * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). * * For checksum errors, we want to include more information about the actual * error which occurs. Accordingly, we build an ereport when the error is * noticed, but instead of sending it in immediately, we hang it off of the * io_cksum_report field of the logical IO. When the logical IO completes * (successfully or not), zfs_ereport_finish_checksum() is called with the * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ #ifdef _KERNEL void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { if (nvl) fm_nvlist_destroy(nvl, FM_NVA_FREE); if (detector) fm_nvlist_destroy(detector, FM_NVA_FREE); } /* * We want to rate limit ZIO delay and checksum events so as to not * flood ZED when a disk is acting up. * * Returns 1 if we're ratelimiting, 0 if not. */ static int zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) { int rc = 0; /* * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we * are. Invert it to get our return value. */ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { rc = !zfs_ratelimit(&vd->vdev_delay_rl); } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { rc = !zfs_ratelimit(&vd->vdev_checksum_rl); } if (rc) { /* We're rate limiting */ fm_erpt_dropped_increment(); } return (rc); } /* * Return B_TRUE if the event actually posted, B_FALSE if not. */ static boolean_t zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { nvlist_t *ereport, *detector; uint64_t ena; char class[64]; if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) return (B_FALSE); if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); if ((detector = fm_nvlist_create(NULL)) == NULL) { fm_nvlist_destroy(ereport, FM_NVA_FREE); return (B_FALSE); } /* * Serialize ereport generation */ mutex_enter(&spa->spa_errlist_lock); /* * Determine the ENA to use for this event. If we are in a loading * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use * a root zio-wide ENA. Otherwise, simply use a unique ENA. */ if (spa_load_state(spa) != SPA_LOAD_NONE) { if (spa->spa_ena == 0) spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = spa->spa_ena; } else if (zio != NULL && zio->io_logical != NULL) { if (zio->io_logical->io_ena == 0) zio->io_logical->io_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = zio->io_logical->io_ena; } else { ena = fm_ena_generate(0, FM_ENA_FMT1); } /* * Construct the full class, detector, and other standard FMA fields. */ (void) snprintf(class, sizeof (class), "%s.%s", ZFS_ERROR_CLASS, subclass); fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), vd != NULL ? vd->vdev_guid : 0); fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); /* * Construct the per-ereport payload, depending on which parameters are * passed in. */ /* * Generic payload members common to all ereports. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, (uint64_t)spa_state(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, (int32_t)spa_load_state(spa), NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, DATA_TYPE_STRING, spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? FM_EREPORT_FAILMODE_WAIT : spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, NULL); if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; vdev_queue_t *vq = &vd->vdev_queue; vdev_stat_t *vs = &vd->vdev_stat; vdev_t *spare_vd; uint64_t *spare_guids; char **spare_paths; int i, spare_count; fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, DATA_TYPE_UINT64, vd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); if (vd->vdev_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, DATA_TYPE_STRING, vd->vdev_path, NULL); if (vd->vdev_devid != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, DATA_TYPE_STRING, vd->vdev_devid, NULL); if (vd->vdev_fru != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, DATA_TYPE_STRING, vd->vdev_fru, NULL); if (vd->vdev_enc_sysfs_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); if (vd->vdev_ashift) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, DATA_TYPE_UINT64, vd->vdev_ashift, NULL); if (vq != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); } if (vs != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, DATA_TYPE_UINT64, vs->vs_read_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, DATA_TYPE_UINT64, vs->vs_write_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, DATA_TYPE_UINT64, vs->vs_checksum_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, DATA_TYPE_UINT64, vs->vs_slow_ios, NULL); } if (pvd != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, DATA_TYPE_UINT64, pvd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, NULL); if (pvd->vdev_path) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, DATA_TYPE_STRING, pvd->vdev_path, NULL); if (pvd->vdev_devid) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, DATA_TYPE_STRING, pvd->vdev_devid, NULL); } spare_count = spa->spa_spares.sav_count; spare_paths = kmem_zalloc(sizeof (char *) * spare_count, KM_SLEEP); spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, KM_SLEEP); for (i = 0; i < spare_count; i++) { spare_vd = spa->spa_spares.sav_vdevs[i]; if (spare_vd) { spare_paths[i] = spare_vd->vdev_path; spare_guids[i] = spare_vd->vdev_guid; } } fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); kmem_free(spare_guids, sizeof (uint64_t) * spare_count); kmem_free(spare_paths, sizeof (char *) * spare_count); } if (zio != NULL) { /* * Payload common to all I/Os. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, DATA_TYPE_INT32, zio->io_error, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, DATA_TYPE_INT32, zio->io_flags, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, DATA_TYPE_UINT32, zio->io_stage, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, DATA_TYPE_UINT32, zio->io_pipeline, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, DATA_TYPE_UINT64, zio->io_delay, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a * RAID-Z or other I/O where the physical offset and length are * provided for us, instead of within the zio_t. */ if (vd != NULL) { if (size) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, stateoroffset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, size, NULL); else fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, zio->io_offset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, zio->io_size, NULL); } } else if (vd != NULL) { /* * If we have a vdev but no zio, this is a device fault, and the * 'stateoroffset' parameter indicates the previous state of the * vdev. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, DATA_TYPE_UINT64, stateoroffset, NULL); } /* * Payload for I/Os with corresponding logical information. */ if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, DATA_TYPE_UINT64, zb->zb_objset, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, DATA_TYPE_UINT64, zb->zb_object, FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, DATA_TYPE_INT64, zb->zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, DATA_TYPE_UINT64, zb->zb_blkid, NULL); } mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; *detector_out = detector; return (B_TRUE); } /* if it's <= 128 bytes, save the corruption directly */ #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) #define MAX_RANGES 16 typedef struct zfs_ecksum_info { /* histograms of set and cleared bits by bit number in a 64-bit word */ uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; /* * for each range, the number of bits set and cleared. The Hamming * distance between the good and bad buffers is the sum of them all. */ uint32_t zei_range_sets[MAX_RANGES]; uint32_t zei_range_clears[MAX_RANGES]; struct zei_ranges { uint32_t zr_start; uint32_t zr_end; } zei_ranges[MAX_RANGES]; size_t zei_range_count; uint32_t zei_mingap; uint32_t zei_allowed_mingap; } zfs_ecksum_info_t; static void update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) { size_t i; size_t bits = 0; uint64_t value = BE_64(value_arg); /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { if (value & (1ull << i)) { hist[63 - i]++; ++bits; } } /* update the count of bits changed */ *count += bits; } /* * We've now filled up the range array, and need to increase "mingap" and * shrink the range list accordingly. zei_mingap is always the smallest * distance between array entries, so we set the new_allowed_gap to be * one greater than that. We then go through the list, joining together * any ranges which are closer than the new_allowed_gap. * * By construction, there will be at least one. We also update zei_mingap * to the new smallest gap, to prepare for our next invocation. */ static void zei_shrink_ranges(zfs_ecksum_info_t *eip) { uint32_t mingap = UINT32_MAX; uint32_t new_allowed_gap = eip->zei_mingap + 1; size_t idx, output; size_t max = eip->zei_range_count; struct zei_ranges *r = eip->zei_ranges; ASSERT3U(eip->zei_range_count, >, 0); ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); output = idx = 0; while (idx < max - 1) { uint32_t start = r[idx].zr_start; uint32_t end = r[idx].zr_end; while (idx < max - 1) { idx++; uint32_t nstart = r[idx].zr_start; uint32_t nend = r[idx].zr_end; uint32_t gap = nstart - end; if (gap < new_allowed_gap) { end = nend; continue; } if (gap < mingap) mingap = gap; break; } r[output].zr_start = start; r[output].zr_end = end; output++; } ASSERT3U(output, <, eip->zei_range_count); eip->zei_range_count = output; eip->zei_mingap = mingap; eip->zei_allowed_mingap = new_allowed_gap; } static void zei_add_range(zfs_ecksum_info_t *eip, int start, int end) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; if (count >= MAX_RANGES) { zei_shrink_ranges(eip); count = eip->zei_range_count; } if (count == 0) { eip->zei_mingap = UINT32_MAX; eip->zei_allowed_mingap = 1; } else { int gap = start - r[count - 1].zr_end; if (gap < eip->zei_allowed_mingap) { r[count - 1].zr_end = end; return; } if (gap < eip->zei_mingap) eip->zei_mingap = gap; } r[count].zr_start = start; r[count].zr_end = end; eip->zei_range_count++; } static size_t zei_range_total_size(zfs_ecksum_info_t *eip) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; size_t result = 0; size_t idx; for (idx = 0; idx < count; idx++) result += (r[idx].zr_end - r[idx].zr_start); return (result); } static zfs_ecksum_info_t * annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, const abd_t *goodabd, const abd_t *badabd, size_t size, boolean_t drop_if_identical) { const uint64_t *good; const uint64_t *bad; uint64_t allset = 0; uint64_t allcleared = 0; size_t nui64s = size / sizeof (uint64_t); size_t inline_size; int no_inline = 0; size_t idx; size_t range; size_t offset = 0; ssize_t start = -1; zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); /* don't do any annotation for injected checksum errors */ if (info != NULL && info->zbc_injected) return (eip); if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, DATA_TYPE_UINT64_ARRAY, sizeof (info->zbc_expected) / sizeof (uint64_t), (uint64_t *)&info->zbc_expected, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, DATA_TYPE_UINT64_ARRAY, sizeof (info->zbc_actual) / sizeof (uint64_t), (uint64_t *)&info->zbc_actual, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, NULL); if (info->zbc_byteswapped) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, DATA_TYPE_BOOLEAN, 1, NULL); } } if (badabd == NULL || goodabd == NULL) return (eip); ASSERT3U(nui64s, <=, UINT32_MAX); ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, <=, UINT32_MAX); good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); /* build up the range list by comparing the two buffers. */ for (idx = 0; idx < nui64s; idx++) { if (good[idx] == bad[idx]) { if (start == -1) continue; zei_add_range(eip, start, idx); start = -1; } else { if (start != -1) continue; start = idx; } } if (start != -1) zei_add_range(eip, start, idx); /* See if it will fit in our inline buffers */ inline_size = zei_range_total_size(eip); if (inline_size > ZFM_MAX_INLINE) no_inline = 1; /* * If there is no change and we want to drop if the buffers are * identical, do so. */ if (inline_size == 0 && drop_if_identical) { kmem_free(eip, sizeof (*eip)); abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); return (NULL); } /* * Now walk through the ranges, filling in the details of the * differences. Also convert our uint64_t-array offsets to byte * offsets. */ for (range = 0; range < eip->zei_range_count; range++) { size_t start = eip->zei_ranges[range].zr_start; size_t end = eip->zei_ranges[range].zr_end; for (idx = start; idx < end; idx++) { uint64_t set, cleared; // bits set in bad, but not in good set = ((~good[idx]) & bad[idx]); // bits set in good, but not in bad cleared = (good[idx] & (~bad[idx])); allset |= set; allcleared |= cleared; if (!no_inline) { ASSERT3U(offset, <, inline_size); eip->zei_bits_set[offset] = set; eip->zei_bits_cleared[offset] = cleared; offset++; } update_histogram(set, eip->zei_histogram_set, &eip->zei_range_sets[range]); update_histogram(cleared, eip->zei_histogram_cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ eip->zei_ranges[range].zr_start *= sizeof (uint64_t); eip->zei_ranges[range].zr_end *= sizeof (uint64_t); } abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); eip->zei_allowed_mingap *= sizeof (uint64_t); inline_size *= sizeof (uint64_t); /* fill in ereport */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, (uint32_t *)eip->zei_ranges, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, DATA_TYPE_UINT32, eip->zei_allowed_mingap, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, NULL); if (!no_inline) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); } else { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, DATA_TYPE_UINT32_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, DATA_TYPE_UINT32_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, NULL); } return (eip); } #endif /* * Make sure our event is still valid for the given zio/vdev/pool. For example, * we don't want to keep logging events for a faulted or missing vdev. */ boolean_t zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) { #ifdef _KERNEL /* * If we are doing a spa_tryimport() or in recovery mode, * ignore errors. */ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) return (B_FALSE); /* * If we are in the middle of opening a pool, and the previous attempt * failed, don't bother logging any new ereports - we're just going to * get the same diagnosis anyway. */ if (spa_load_state(spa) != SPA_LOAD_NONE && spa->spa_last_open_failed) return (B_FALSE); if (zio != NULL) { /* * If this is not a read or write zio, ignore the error. This * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) return (B_FALSE); if (vd != NULL) { /* * If the vdev has already been marked as failing due * to a failed probe, then ignore any subsequent I/O * errors, as the DE will automatically fault the vdev * on the first such failure. This also catches cases * where vdev_remove_wanted is set and the device has * not yet been asynchronously placed into the REMOVED * state. */ if (zio->io_vd == vd && !vdev_accessible(vd, zio)) return (B_FALSE); /* * Ignore checksum errors for reads from DTL regions of * leaf vdevs. */ if (zio->io_type == ZIO_TYPE_READ && zio->io_error == ECKSUM && vd->vdev_ops->vdev_op_leaf && vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) return (B_FALSE); } } /* * For probe failure, we want to avoid posting ereports if we've * already removed the device in the meantime. */ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) return (B_FALSE); /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && (zio != NULL) && (!zio->io_timestamp)) { return (B_FALSE); } #endif return (B_TRUE); } /* * Return 0 if event was posted, EINVAL if there was a problem posting it or * EBUSY if the event was rate limited. */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, zb, zio, stateoroffset, size)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) return (SET_ERROR(EINVAL)); /* Cleanup is handled by the callback function */ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); #endif return (rc); } void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) { zio_cksum_report_t *report; #ifdef _KERNEL if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return; #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); if (zio->io_vsd != NULL) zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); else zio_vsd_default_cksum_report(zio, report, arg); /* copy the checksum failure information if it was provided */ if (info != NULL) { report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); bcopy(info, report->zcr_ckinfo, sizeof (*info)); } report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; report->zcr_length = length; #ifdef _KERNEL - zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); return; } #endif mutex_enter(&spa->spa_errlist_lock); report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); } void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical) { #ifdef _KERNEL zfs_ecksum_info_t *info; info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, good_data, bad_data, report->zcr_length, drop_if_identical); if (info != NULL) zfs_zevent_post(report->zcr_ereport, report->zcr_detector, zfs_zevent_post_cb); else zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); report->zcr_ereport = report->zcr_detector = NULL; if (info != NULL) kmem_free(info, sizeof (*info)); #endif } void zfs_ereport_free_checksum(zio_cksum_report_t *rpt) { #ifdef _KERNEL if (rpt->zcr_ereport != NULL) { fm_nvlist_destroy(rpt->zcr_ereport, FM_NVA_FREE); fm_nvlist_destroy(rpt->zcr_detector, FM_NVA_FREE); } #endif rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); if (rpt->zcr_ckinfo != NULL) kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); kmem_free(rpt, sizeof (*rpt)); } int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return (EBUSY); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { return (SET_ERROR(EINVAL)); } info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); if (info != NULL) { rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } #endif return (rc); } /* * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h * and are designed to be consumed by the ZFS Event Daemon (ZED). For * additional details refer to the zed(8) man page. */ nvlist_t * zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { nvlist_t *resource = NULL; #ifdef _KERNEL char class[64]; if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return (NULL); if ((resource = fm_nvlist_create(NULL)) == NULL) return (NULL); (void) snprintf(class, sizeof (class), "%s.%s.%s", type, ZFS_ERROR_CLASS, name); VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); VERIFY0(nvlist_add_int32(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); if (vd) { VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); if (vd->vdev_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); if (vd->vdev_devid != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); if (vd->vdev_fru != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); if (vd->vdev_enc_sysfs_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path)); } /* also copy any optional payload data */ if (aux) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) (void) nvlist_add_nvpair(resource, elem); } #endif return (resource); } static void zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, type, name, aux); if (resource) zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); #endif } /* * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev * has been removed from the system. This will cause the DE to ignore any * recent I/O errors, inferring that they are due to the asynchronous device * removal. */ void zfs_post_remove(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); } /* * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool * has the 'autoreplace' property set, and therefore any broken vdevs will be * handled by higher level logic, and no vdev fault should be generated. */ void zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); } /* * The 'resource.fs.zfs.statechange' event is an internal signal that the * given vdev has transitioned its state to DEGRADED or HEALTHY. This will * cause the retire agent to repair any outstanding fault management cases * open because the device was not found (fault.fs.zfs.device). */ void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) { #ifdef _KERNEL nvlist_t *aux; /* * Add optional supplemental keys to payload */ aux = fm_nvlist_create(NULL); if (vd && aux) { if (vd->vdev_physpath) { (void) nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, vd->vdev_physpath); } if (vd->vdev_enc_sysfs_path) { (void) nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); } (void) nvlist_add_uint64(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); } zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, aux); if (aux) fm_nvlist_destroy(aux, FM_NVA_FREE); #endif } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); #endif /* _KERNEL */ Index: vendor-sys/openzfs/dist/module/zfs/zfs_ioctl.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/zfs_ioctl.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/zfs_ioctl.c (revision 365340) @@ -1,7629 +1,7630 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Portions Copyright 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. * * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; zfsdev_state_t *zfsdev_state_list; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ unsigned long zfs_max_nvlist_src_size = 0; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } /* ARGSUSED */ static int zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_recv(zc, innvl, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : RW_READER, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ rrm_exit(&(*zfvp)->z_teardown_lock, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { rrm_exit(&zfsvfs->z_teardown_lock, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, rebuild); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *domain; char *dash; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; char *strval = NULL; int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: err = zvol_set_snapdev(dsname, source, intval); break; case ZFS_PROP_VOLMODE: err = zvol_set_volmode(dsname, source, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; char *strval; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { char *message; spa_t *spa; int error; char *poolname; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the GRUB bootloader used on Linux to store the contents * of the grubenv file. The file is stored as raw ASCII, and is protected by * an embedded checksum. By default, GRUB will check if the boot filesystem * supports storing the environment data in a special location, and if so, * will invoke filesystem specific logic to retrieve it. This can be overridden * by a variable, should the user so desire. */ /* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"envmap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { char *envmap; int error; spa_t *spa; envmap = fnvlist_lookup_string(innvl, "envmap"); if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount((char *)snapname, MNT_FORCE); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || cmd_type == POOL_INITIALIZE_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { return (SET_ERROR(EINVAL)); } for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? EINVAL : 0); } /* * innvl: { * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64) * "trim_vdevs": { -> guids to TRIM (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * "trim_rate" -> Target TRIM rate in bytes/sec. * "trim_secure" -> Set to request a secure TRIM. * } * * outnvl: { * "trim_vdevs": { -> TRIM errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? EINVAL : 0); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. */ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. * innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ /* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *redactnvl = NULL; char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; + boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; - if (ost == DMU_OST_ZFS) { + if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((error = zfs_file_get(input_fd, &input_fp))) return (error); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_INVAL) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. */ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fd); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) return (error); if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) return (error); if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ error = SET_ERROR(EINVAL); } nvlist_free(errors); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; char *snapname; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); return (error); } typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_io_t dbi; dbi.dbi_fp = arg; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dbi.dbi_err); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { zfs_file_t *fp; dmu_send_outparams_t out = {0}; if ((error = zfs_file_get(zc->zc_cookie, &fp))) return (error); off = zfs_file_off(fp); out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); zfs_file_put(zc->zc_cookie); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. */ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. */ if (spa_multihost(spa) && spa_suspended(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); zfs_vfs_rele(zfsvfs); } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); dmu_objset_rele_flags(os, B_TRUE, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; int error; minor_t minor; error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (error != 0) return (error); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((error = zfs_file_get(zc->zc_cookie, &fp))) return (error); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(zc->zc_cookie); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error != 0) return (SET_ERROR(error)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (minor != 0) zfs_onexit_fd_rele(cleanup_fd); return (SET_ERROR(error)); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (error != 0) return (error); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(zc->zc_cleanup_fd); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { int count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (error != 0) return (error); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { int error; offset_t off; char *fromname = NULL; int fd; zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); if ((error = zfs_file_get(fd, &fp))) return (error); off = zfs_file_off(fp); dmu_send_outparams_t out = {0}; out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); zfs_file_put(fd); return (error); } /* ARGSUSED */ static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resumeobj", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resumeoff", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; char *fromname = NULL; char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. */ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; /* ARGSUSED */ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { int err; boolean_t force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) force = fnvlist_lookup_boolean_value(innvl, "force"); if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (err); } /* * Load a user's wrapping key into the kernel. * innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. * * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(int fd, minor_t *minorp) { zfsdev_state_t *zs, *fpd; zfs_file_t *fp; int rc; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); if ((rc = zfs_file_get(fd, &fp))) return (rc); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } static void * zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { smp_rmb(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { void *ptr; ptr = zfsdev_get_state_impl(minor, which); return (ptr); } /* * Find a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. */ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. */ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. */ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); zfsdev_state_list->zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/zio.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/zio.c (revision 365339) +++ vendor-sys/openzfs/dist/module/zfs/zio.c (revision 365340) @@ -1,4969 +1,4969 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ int zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of block's size will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; vmem_t *data_alloc_arena = NULL; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t data_cflags, cflags; data_cflags = KMC_NODEBUG; cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; #if defined(_ILP32) && defined(_KERNEL) /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space. */ if (size > zfs_max_recordsize) break; #endif while (!ISP2(p2)) p2 &= p2 - 1; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; /* * Here's the problem - on 4K native devices in userland on * Linux using O_DIRECT, buffers must be 4K aligned or I/O * will fail with EINVAL, causing zdb (and others) to coredump. * Since userland probably doesn't need optimized buffer caches, * we just force 4K alignment on everything. */ align = 8 * SPA_MINBLOCKSIZE; #else if (size < PAGESIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; } #endif if (align != 0) { char name[36]; (void) snprintf(name, sizeof (name), "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, data_alloc_arena, data_cflags); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { #ifdef _ILP32 /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space. */ if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) break; #endif #if defined(ZFS_DEBUG) && !defined(_KERNEL) if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((c + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[c], (long long unsigned)zio_buf_cache_frees[c]); #endif if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size, &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. */ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, tmp, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); zio_buf_free(tmp, lsize); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but for the moment * enum zio_flag is out of bits. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. Otherwise dispatch the parent zio as its own task. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { if (zio->io_metaslab_class == NULL) zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } static void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); switch (blk_verify) { case BLK_VERIFY_HALT: dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Modes for zfs_blkptr_verify: * 1) BLK_VERIFY_ONLY (evaluate the block) * 2) BLK_VERIFY_LOG (evaluate the block and log problems) * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) */ boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (!spa->spa_trust_config) return (B_TRUE); if (!config_held) spa_config_enter(spa, SCL_VDEV, bp, RW_READER); else ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (errors > 0) dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); if (!config_held) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, BLK_VERIFY_HALT); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. */ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. [see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) { /* * GANG and DEDUP blocks can induce a read (for the gang block * header, or the DDT), so issue them asynchronously so that * this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, done, private, flags)); } return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; int pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round up compressed size up to the ashift * of the smallest-ashift device, and zero the tail. * This ensures that the compressed size of the BP * (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)P2ROUNDUP(psize, 1ULL << spa->spa_min_ashift); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if ((zio->io_priority == ZIO_PRIORITY_NOW || zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); zio_interrupt(zio); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, (task_func_t *)zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. */ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s last=%llu " "type=%d priority=%d flags=0x%x " "stage=0x%x pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu level=%llu blkid=%llu " "offset=%llu size=%llu error=%d", ziodepth, pio, pio->io_timestamp, delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, pio->io_offset, pio->io_size, pio->io_error); - zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, + (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. */ void zio_execute(zio_t *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; kpreempt_disable(); pio = spa->spa_async_zio_root[CPU_SEQID]; kpreempt_enable(); zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); - zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, + (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_put(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_put(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } /* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_put(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_put(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * encrypted blocks need DVA[2] free so encrypted gang headers can't * have a third copy. */ gbh_copies = MIN(copies + 1, spa_max_replication(spa)); if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) gbh_copies = SPA_DVAS_PER_BP - 1; int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); bzero(zp.zp_iv, ZIO_DATA_IV_LEN); bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, zio_write_gang_member_ready, NULL, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; /* * We didn't allocate this bp, so make sure it doesn't get unmarked. */ pio->io_flags &= ~ZIO_FLAG_FASTWRITE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); zio_nowait(cio); return (zio); } ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); zio = avl_first(&spa->spa_alloc_trees[allocator]); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_alloc_trees[allocator], zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zbookmark_phys_t *bm = &zio->io_bookmark; /* * We want to try to use as many allocators as possible to help improve * performance, but we also want logically adjacent IOs to be physically * adjacent to improve sequential read performance. We chunk each object * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. */ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio->io_metaslab_class = mc; avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); nio = zio_io_to_allocate(spa, zio->io_allocator); mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_alloc_locks[allocator]); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_alloc_locks[allocator]); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_metaslab_class = mc; } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. */ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; mc = spa_normal_class(spa); VERIFY(metaslab_class_throttle_reserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } else { mc = spa_normal_class(spa); } zio->io_metaslab_class = mc; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error != 0) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); if (error == 0) { *slog = TRUE; } else { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); if (error == 0) *slog = FALSE; } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_removing) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (zio); if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } /*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && zio->io_physdone != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); zio->io_physdone(zio->io_logical); } return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. */ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, NULL, &info); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. */ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held( &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) { zio->io_vd->vdev_stat.vs_read_errors++; } else if (zio->io_type == ZIO_TYPE_WRITE) { zio->io_vd->vdev_stat.vs_write_errors++; } mutex_exit(&zio->io_vd->vdev_stat_lock); - zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, + (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0, 0); } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, - NULL, &zio->io_bookmark, zio, 0, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, + zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT(last_block->zb_level == 0); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/tests/runfiles/common.run =================================================================== --- vendor-sys/openzfs/dist/tests/runfiles/common.run (revision 365339) +++ vendor-sys/openzfs/dist/tests/runfiles/common.run (revision 365340) @@ -1,900 +1,900 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/alloc_class] tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', 'alloc_class_013_pos'] tags = ['functional', 'alloc_class'] [tests/functional/atime] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', 'bootfs_008_pos'] tags = ['functional', 'bootfs'] [tests/functional/btree] tests = ['btree_positive', 'btree_negative'] tags = ['functional', 'btree'] pre = post = [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', 'sensitive_none_lookup', 'sensitive_none_delete', 'sensitive_formd_lookup', 'sensitive_formd_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'insensitive_formd_lookup', 'insensitive_formd_delete', 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy', 'tst.terminate_by_signal' ] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/checksum] tests = ['run_sha2_test', 'run_skein_test', 'filetest_001_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', 'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] tags = ['functional', 'cli_root', 'zfs_copies'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', 'zfs_diff_types', 'zfs_diff_encrypted'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_ids_to_path] tests = ['zfs_ids_to_path_001_pos'] tags = ['functional', 'cli_root', 'zfs_ids_to_path'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_property] tests = ['zfs_written_property_001_pos'] tags = ['functional', 'cli_root', 'zfs_property'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'receive-o-x_props_override', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', - 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint'] + 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', 'zfs_snapshot_009_pos'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', 'add-o_ashift', 'add_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg', 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', 'zpool_clear_readonly'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'create-o_ashift', 'zpool_create_tempname'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', 'zpool_events_poolname', 'zpool_events_errors'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', 'import_cachefile_mirror_attached', 'import_cachefile_mirror_detached', 'import_cachefile_shared_device', 'import_devices_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_import_export', 'zpool_initialize_offline_export_import_online', 'zpool_initialize_online_offline', 'zpool_initialize_split', 'zpool_initialize_start_and_cancel_neg', 'zpool_initialize_start_and_cancel_pos', 'zpool_initialize_suspend_resume', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_resilver', 'zpool_split_indirect'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', 'zpool_trim_verify_trimmed'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', 'zpool_upgrade_009_neg'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_discard', 'zpool_wait_freeing', 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', 'zpool_wait_initialize_flag', 'zpool_wait_multiple', 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zfs_list] tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] user = tags = ['functional', 'cli_user', 'zfs_list'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/cli_user/zpool_status] tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_status'] [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] tags = ['functional', 'compression'] [tests/functional/cp_files] tests = ['cp_files_001_pos'] tags = ['functional', 'cp_files'] [tests/functional/ctime] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] tags = ['functional', 'features', 'async_destroy'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', 'history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos', 'history_010_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] tests = ['run_hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inheritance] tests = ['inherit_001_pos'] pre = tags = ['functional', 'inheritance'] [tests/functional/io] tests = ['sync', 'psync', 'posixaio', 'mmap'] tags = ['functional', 'io'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/largest_pool] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/limits] tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', 'snapshot_limit'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_001', 'link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_write_001_pos', 'mmap_read_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] tests = ['umount_001', 'umountall_001'] tags = ['functional', 'mount'] [tests/functional/mv_files] tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] tags = ['functional', 'mv_files'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', 'enospc_df'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', 'nopwrite_varying_compression', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/online_offline] tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] [tests/functional/persist_l2arc] tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] tags = ['functional', 'persist_l2arc'] [tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', 'checkpoint_discard_busy', 'checkpoint_discard_many', 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/pool_names] tests = ['pool_names_001_pos', 'pool_names_002_neg'] pre = post = tags = ['functional', 'pool_names'] [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] [tests/functional/quota] tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size', 'redacted_volume'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', 'redundancy_004_neg'] tags = ['functional', 'redundancy'] [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg', 'refquota_008_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', 'refreserv_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_errors', 'removal_with_export', 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', 'remove_indirect'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', 'attach_resilver', 'detach', 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', 'slog_replay_fs_002', 'slog_replay_volume'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none'] tags = ['functional', 'suid'] [tests/functional/threadsappend] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] [tests/functional/trim] tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', 'trim_integrity', 'trim_config', 'trim_l2arc'] tags = ['functional', 'trim'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/userquota] tests = [ 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', 'userspace_001_pos', 'userspace_002_pos'] tags = ['functional', 'userquota'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', 'vdev_zaps_007_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/log_spacemap] tests = ['log_spacemap_import_logs'] pre = post = tags = ['functional', 'log_spacemap'] Index: vendor-sys/openzfs/dist/tests/zfs-tests/include/tunables.cfg =================================================================== --- vendor-sys/openzfs/dist/tests/zfs-tests/include/tunables.cfg (revision 365339) +++ vendor-sys/openzfs/dist/tests/zfs-tests/include/tunables.cfg (revision 365340) @@ -1,89 +1,89 @@ # This file exports variables for each tunable used in the test suite. # # Different platforms use different names for most tunables. To avoid littering # the tests with conditional logic for deciding how to set each tunable, the # logic is instead consolidated to this one file. # # Any use of tunables in tests must use a name defined here. New entries # should be added to the table as needed. Please keep the table sorted # alphabetically for ease of maintenance. # # Platform-specific tunables should still use a NAME from this table for # consistency. Enter UNSUPPORTED in the column for platforms on which the # tunable is not implemented. UNAME=$(uname) # NAME FreeBSD tunable Linux tunable cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min ASYNC_BLOCK_MAX_BLOCKS async_block_max_blocks zfs_async_block_max_blocks CHECKSUM_EVENTS_PER_SECOND checksum_events_per_second zfs_checksum_events_per_second COMMIT_TIMEOUT_PCT commit_timeout_pct zfs_commit_timeout_pct COMPRESSED_ARC_ENABLED compressed_arc_enabled zfs_compressed_arc_enabled CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms zfs_condense_indirect_commit_entry_delay_ms CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes DBUF_CACHE_MAX_BYTES dbuf_cache.max_bytes dbuf_cache_max_bytes DEADMAN_CHECKTIME_MS deadman_checktime_ms zfs_deadman_checktime_ms DEADMAN_FAILMODE deadman_failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman_synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman_ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled L2ARC_TRIM_AHEAD l2arc.trim_ahead l2arc_trim_ahead L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc LIVELIST_CONDENSE_SYNC_CANCEL livelist.condense.sync_cancel zfs_livelist_condense_sync_cancel LIVELIST_CONDENSE_SYNC_PAUSE livelist.condense.sync_pause zfs_livelist_condense_sync_pause LIVELIST_CONDENSE_ZTHR_CANCEL livelist.condense.zthr_cancel zfs_livelist_condense_zthr_cancel LIVELIST_CONDENSE_ZTHR_PAUSE livelist.condense.zthr_pause zfs_livelist_condense_zthr_pause LIVELIST_MAX_ENTRIES livelist.max_entries zfs_livelist_max_entries LIVELIST_MIN_PERCENT_SHARED livelist.min_percent_shared zfs_livelist_min_percent_shared MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals -MULTIHOST_HISTORY UNSUPPORTED zfs_multihost_history +MULTIHOST_HISTORY multihost.history zfs_multihost_history MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time SLOW_IO_EVENTS_PER_SECOND slow_io_events_per_second zfs_slow_io_events_per_second SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch -TXG_HISTORY UNSUPPORTED zfs_txg_history +TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms %%%% while read name FreeBSD Linux; do eval "export ${name}=\$${UNAME}" done Index: vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am =================================================================== --- vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am (revision 365339) +++ vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am (revision 365340) @@ -1,25 +1,26 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_rename dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ zfs_rename_001_pos.ksh \ zfs_rename_002_pos.ksh \ zfs_rename_003_pos.ksh \ zfs_rename_004_neg.ksh \ zfs_rename_005_neg.ksh \ zfs_rename_006_pos.ksh \ zfs_rename_007_pos.ksh \ zfs_rename_008_pos.ksh \ zfs_rename_009_neg.ksh \ zfs_rename_010_neg.ksh \ zfs_rename_011_pos.ksh \ zfs_rename_012_neg.ksh \ zfs_rename_013_pos.ksh \ zfs_rename_014_neg.ksh \ zfs_rename_encrypted_child.ksh \ zfs_rename_to_encrypted.ksh \ - zfs_rename_mountpoint.ksh + zfs_rename_mountpoint.ksh \ + zfs_rename_nounmount.ksh dist_pkgdata_DATA = \ zfs_rename.cfg \ zfs_rename.kshlib Index: vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh =================================================================== --- vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh (revision 365339) +++ vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh (revision 365340) @@ -1,88 +1,88 @@ #!/bin/ksh -p # # CDDL HEADER START # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy is of the CDDL is also available via the Internet # at http://www.illumos.org/license/CDDL. # # CDDL HEADER END # # # Copyright (c) 2018 Datto Inc. # . $STF_SUITE/include/libtest.shlib # # DESCRIPTION: # zfs rename should rename datasets even for mountpoint=none children # # STRATEGY: # 1. Create a set of nested datasets with mountpoint=none # 2. Verify datasets aren't mounted except for the parent # 3. Rename mountpoint and verify all child datasets are renamed # verify_runnable "both" function rename_cleanup { - log_note zfs destroy -fR $TESTPOOL/rename_test - log_note zfs destroy -fR $TESTPOOL/renamed + zfs destroy -fR $TESTPOOL/rename_test + zfs destroy -fR $TESTPOOL/renamed } log_onexit rename_cleanup log_must zfs create $TESTPOOL/rename_test log_must zfs create $TESTPOOL/rename_test/ds log_must zfs create -o mountpoint=none $TESTPOOL/rename_test/child log_must zfs create $TESTPOOL/rename_test/child/grandchild if ! ismounted $TESTPOOL/rename_test; then log_fail "$TESTPOOL/rename_test is not mounted" fi if ! ismounted $TESTPOOL/rename_test/ds; then log_fail "$TESTPOOL/rename_test/ds is not mounted" fi if ismounted $TESTPOOL/rename_test/child; then log_fail "$TESTPOOL/rename_test/child is mounted" fi if ismounted $TESTPOOL/rename_test/child/grandchild; then log_fail "$TESTPOOL/rename_test/child/grandchild is mounted" fi log_must zfs rename $TESTPOOL/rename_test $TESTPOOL/renamed log_mustnot zfs list $TESTPOOL/rename_test log_mustnot zfs list $TESTPOOL/rename_test/ds log_mustnot zfs list $TESTPOOL/rename_test/child log_mustnot zfs list $TESTPOOL/rename_test/child/grandchild log_must zfs list $TESTPOOL/renamed log_must zfs list $TESTPOOL/renamed/ds log_must zfs list $TESTPOOL/renamed/child log_must zfs list $TESTPOOL/renamed/child/grandchild if ! ismounted $TESTPOOL/renamed; then log_must zfs get all $TESTPOOL/renamed log_fail "$TESTPOOL/renamed is not mounted" fi if ! ismounted $TESTPOOL/renamed/ds; then log_fail "$TESTPOOL/renamed/ds is not mounted" fi if ismounted $TESTPOOL/renamed/child; then log_fail "$TESTPOOL/renamed/child is mounted" fi if ismounted $TESTPOOL/renamed/child/grandchild; then log_fail "$TESTPOOL/renamed/child/grandchild is mounted" fi log_pass "Verified rename for mountpoint=none children." Index: vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh =================================================================== --- vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh (nonexistent) +++ vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh (revision 365340) @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy is of the CDDL is also available via the Internet +# at http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# zfs rename -u should rename datasets without unmounting them +# +# STRATEGY: +# 1. Create a set of nested datasets. +# 2. Verify datasets are mounted. +# 3. Rename with -u and verify all datasets stayed mounted. +# + +verify_runnable "both" + +function rename_cleanup +{ + cd $back + zfs destroy -fR $TESTPOOL/rename_test + zfs destroy -fR $TESTPOOL/renamed +} + +back=$(pwd) +log_onexit rename_cleanup + +log_must zfs create $TESTPOOL/rename_test +log_must zfs create $TESTPOOL/rename_test/child +log_must zfs create $TESTPOOL/rename_test/child/grandchild + +if ! ismounted $TESTPOOL/rename_test; then + log_fail "$TESTPOOL/rename_test is not mounted" +fi +if ! ismounted $TESTPOOL/rename_test/child; then + log_fail "$TESTPOOL/rename_test/child is not mounted" +fi +if ! ismounted $TESTPOOL/rename_test/child/grandchild; then + log_fail "$TESTPOOL/rename_test/child/grandchild is not mounted" +fi + +mntp_p=$(get_prop mountpoint $TESTPOOL/rename_test) +mntp_c=$(get_prop mountpoint $TESTPOOL/rename_test/child) +mntp_g=$(get_prop mountpoint $TESTPOOL/rename_test/child/grandchild) + +log_must cd $mntp_g +log_mustnot zfs rename $TESTPOOL/rename_test $TESTPOOL/renamed +log_must zfs rename -u $TESTPOOL/rename_test $TESTPOOL/renamed + +log_mustnot zfs list $TESTPOOL/rename_test +log_mustnot zfs list $TESTPOOL/rename_test/child +log_mustnot zfs list $TESTPOOL/rename_test/child/grandchild + +log_must zfs list $TESTPOOL/renamed +log_must zfs list $TESTPOOL/renamed/child +log_must zfs list $TESTPOOL/renamed/child/grandchild + +missing=$(zfs mount | awk -v pat=$TESTPOOL/renamed '$1 ~ pat' | awk \ + -v mntp_p=$mntp_p \ + -v mntp_c=$mntp_c \ + -v mntp_g=$mntp_g ' + BEGIN { p = c = g = 0 } + $2 == mntp_p { p = 1 } + $2 == mntp_c { c = 1 } + $2 == mntp_g { g = 1 } + END { + if (p != 1) + print mntp_p + if (c != 1) + print mntp_c + if (g != 1) + print mntp_g + }') +[[ -z "$missing" ]] || log_fail "Mountpoints no longer mounted: $missing" + +log_pass "Verified rename -u does not unmount datasets" Index: vendor-sys/openzfs/dist/zfs.release.in =================================================================== --- vendor-sys/openzfs/dist/zfs.release.in (revision 365339) +++ vendor-sys/openzfs/dist/zfs.release.in (revision 365340) Property changes on: vendor-sys/openzfs/dist/zfs.release.in ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property