diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 6a1e37a2e362..74bdd796e88f 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -1,8823 +1,8823 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright 2012 Milan Jurik. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright 2016 Igor Kozhukhov .
 * Copyright 2016 Nexenta Systems, Inc.
 * Copyright (c) 2019 Datto Inc.
 * Copyright (c) 2019, loli10K
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
 */

#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#ifdef HAVE_IDMAP
#include #include
#endif /* HAVE_IDMAP */

#include "zfs_iter.h"
#include "zfs_util.h"
#include "zfs_comutil.h"
#include "zfs_projectutil.h"

libzfs_handle_t *g_zfs;

static char history_str[HIS_MAX_RECORD_LEN];
static boolean_t log_history = B_TRUE;

static int zfs_do_clone(int argc, char **argv);
static int zfs_do_create(int argc, char **argv);
static int zfs_do_destroy(int argc, char **argv);
static int zfs_do_get(int argc, char **argv);
static int zfs_do_inherit(int argc, char **argv);
static int zfs_do_list(int argc, char **argv);
static int zfs_do_mount(int argc, char **argv);
static int zfs_do_rename(int argc, char **argv);
static int zfs_do_rollback(int argc, char **argv);
static int zfs_do_set(int argc, char **argv);
static int zfs_do_upgrade(int argc, char **argv);
static int zfs_do_snapshot(int argc, char **argv);
static int zfs_do_unmount(int argc, char **argv);
static int zfs_do_share(int argc, char **argv);
static int zfs_do_unshare(int argc, char **argv);
static int zfs_do_send(int argc, char **argv);
static int zfs_do_receive(int argc, char **argv);
static int zfs_do_promote(int argc, char **argv);
static int zfs_do_userspace(int argc, char **argv);
static int zfs_do_allow(int argc, char **argv);
static int zfs_do_unallow(int argc, char **argv);
static int zfs_do_hold(int argc, char **argv);
static int zfs_do_holds(int argc, char **argv);
static int zfs_do_release(int argc, char **argv);
static int zfs_do_diff(int argc, char **argv);
static int zfs_do_bookmark(int argc, char **argv);
static int zfs_do_channel_program(int argc, char **argv);
static int zfs_do_load_key(int argc, char **argv);
static int zfs_do_unload_key(int argc, char **argv);
static int zfs_do_change_key(int argc, char **argv);
static int zfs_do_project(int argc, char **argv);
static int zfs_do_version(int argc, char **argv);
static int zfs_do_redact(int argc, char **argv);
static int zfs_do_wait(int argc, char **argv);

#ifdef __FreeBSD__
static int zfs_do_jail(int argc, char **argv);
static int zfs_do_unjail(int argc, char **argv);
#endif

#ifdef __linux__
static int zfs_do_zone(int argc, char **argv);
static int zfs_do_unzone(int argc, char **argv);
#endif

/*
 * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
 */
#ifdef DEBUG
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}
#endif

typedef enum {
	HELP_CLONE,
	HELP_CREATE,
	HELP_DESTROY,
	HELP_GET,
	HELP_INHERIT,
	HELP_UPGRADE,
	HELP_LIST,
	HELP_MOUNT,
	HELP_PROMOTE,
	HELP_RECEIVE,
	HELP_RENAME,
	HELP_ROLLBACK,
	HELP_SEND,
	HELP_SET,
	HELP_SHARE,
	HELP_SNAPSHOT,
	HELP_UNMOUNT,
	HELP_UNSHARE,
	HELP_ALLOW,
	HELP_UNALLOW,
	HELP_USERSPACE,
	HELP_GROUPSPACE,
	HELP_PROJECTSPACE,
	HELP_PROJECT,
	HELP_HOLD,
	HELP_HOLDS,
	HELP_RELEASE,
	HELP_DIFF,
	HELP_BOOKMARK,
	HELP_CHANNEL_PROGRAM,
	HELP_LOAD_KEY,
	HELP_UNLOAD_KEY,
	HELP_CHANGE_KEY,
	HELP_VERSION,
	HELP_REDACT,
	HELP_JAIL,
	HELP_UNJAIL,
	HELP_WAIT,
	HELP_ZONE,
	HELP_UNZONE,
} zfs_help_t;

typedef struct zfs_command {
	const char *name;
	int (*func)(int argc, char **argv);
	zfs_help_t usage;
} zfs_command_t;

/*
 * Master command table. Each ZFS command has a name, associated function, and
 * usage message. The usage messages need to be internationalized, so we have
 * to have a function to return the usage message based on a command index.
 *
 * These commands are organized according to how they are displayed in the
 * usage message. An empty command (one with a NULL name) indicates an empty
 * line in the generic usage message.
*/ static zfs_command_t command_table[] = { { "version", zfs_do_version, HELP_VERSION }, { NULL }, { "create", zfs_do_create, HELP_CREATE }, { "destroy", zfs_do_destroy, HELP_DESTROY }, { NULL }, { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, { "rollback", zfs_do_rollback, HELP_ROLLBACK }, { "clone", zfs_do_clone, HELP_CLONE }, { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, { NULL }, { "userspace", zfs_do_userspace, HELP_USERSPACE }, { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE }, { NULL }, { "project", zfs_do_project, HELP_PROJECT }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, { "share", zfs_do_share, HELP_SHARE }, { "unshare", zfs_do_unshare, HELP_UNSHARE }, { NULL }, { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, { "redact", zfs_do_redact, HELP_REDACT }, { "wait", zfs_do_wait, HELP_WAIT }, #ifdef __FreeBSD__ { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, #endif #ifdef __linux__ { "zone", zfs_do_zone, HELP_ZONE }, { "unzone", zfs_do_unzone, HELP_UNZONE }, #endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) zfs_command_t *current_command; static const char * get_usage(zfs_help_t idx) { switch (idx) { case HELP_CLONE: return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: return (gettext("\tcreate [-Pnpuv] [-o property=value] ... " "\n" "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: return (gettext("\tdestroy [-fnpRrv] \n" "\tdestroy [-dnpRrv] " "@[%][,...]\n" "\tdestroy #\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " "[-o \"all\" | field[,...]]\n" "\t [-t type[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot|bookmark] ...\n")); case HELP_INHERIT: return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " "[-s property]...\n\t [-S property]... [-t type[,...]] " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: return (gettext("\treceive [-vMnsFhu] " "[-o =] ... [-x ] ...\n" "\t \n" "\treceive [-vMnsFhu] [-o =] ... " "[-x ] ... 
\n" "\t [-d | -e] \n" "\treceive -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" "\trename -p [-f] \n" "\trename -u [-f] \n" "\trename -r \n")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: return (gettext("\tsend [-DLPbcehnpsvw] " "[-i|-I snapshot]\n" "\t [-R [-X dataset[,dataset]...]] \n" "\tsend [-DnvPLecw] [-i snapshot|bookmark] " "\n" "\tsend [-DnPpvLec] [-i bookmark|snapshot] " "--redact \n" "\tsend [-nvPe] -t \n" "\tsend [-Pnv] --saved filesystem\n")); case HELP_SET: return (gettext("\tset ... " " ...\n")); case HELP_SHARE: return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot [-r] [-o property=value] ... " "@ ...\n")); case HELP_UNMOUNT: return (gettext("\tunmount [-fu] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " "<-a [nfs|smb] | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow \n" "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " "\n" "\tallow -c [,...] \n" "\tallow -s @setname [,...] " "\n")); case HELP_UNALLOW: return (gettext("\tunallow [-rldug] " "<\"everyone\"|user|group>[,...]\n" "\t [[,...]] \n" "\tunallow [-rld] -e [[,...]] " "\n" "\tunallow [-r] -c [[,...]] " "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); case HELP_USERSPACE: return (gettext("\tuserspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_GROUPSPACE: return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_PROJECTSPACE: return (gettext("\tprojectspace [-Hp] [-o field[,...]] " "[-s field] ... \n" "\t [-S field] ... \n")); case HELP_PROJECT: return (gettext("\tproject [-d|-r] \n" "\tproject -c [-0] [-d|-r] [-p id] \n" "\tproject -C [-k] [-r] \n" "\tproject [-p id] [-r] [-s] \n")); case HELP_HOLD: return (gettext("\thold [-r] ...\n")); case HELP_HOLDS: return (gettext("\tholds [-rH] ...\n")); case HELP_RELEASE: return (gettext("\trelease [-r] ...\n")); case HELP_DIFF: return (gettext("\tdiff [-FHt] " "[snapshot|filesystem]\n")); case HELP_BOOKMARK: return (gettext("\tbookmark " "\n")); case HELP_CHANNEL_PROGRAM: return (gettext("\tprogram [-jn] [-t ] " "[-m ]\n" "\t [lua args...]\n")); case HELP_LOAD_KEY: return (gettext("\tload-key [-rn] [-L ] " "<-a | filesystem|volume>\n")); case HELP_UNLOAD_KEY: return (gettext("\tunload-key [-r] " "<-a | filesystem|volume>\n")); case HELP_CHANGE_KEY: return (gettext("\tchange-key [-l] [-o keyformat=]\n" "\t [-o keylocation=] [-o pbkdf2iters=]\n" "\t \n" "\tchange-key -i [-l] \n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_REDACT: return (gettext("\tredact " " ...\n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: return (gettext("\tunjail \n")); case HELP_WAIT: return (gettext("\twait [-t ] \n")); case HELP_ZONE: return (gettext("\tzone \n")); case HELP_UNZONE: return (gettext("\tunzone \n")); default: __builtin_unreachable(); } } void nomem(void) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); exit(1); } /* * Utility function to guarantee malloc() success. 
*/ void * safe_malloc(size_t size) { void *data; if ((data = calloc(1, size)) == NULL) nomem(); return (data); } static void * safe_realloc(void *data, size_t size) { void *newp; if ((newp = realloc(data, size)) == NULL) { free(data); nomem(); } return (newp); } static char * safe_strdup(const char *str) { char *dupstr = strdup(str); if (dupstr == NULL) nomem(); return (dupstr); } /* * Callback routine that will print out information for each of * the properties. */ static int usage_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, "YES "); if (zfs_prop_inheritable(prop)) (void) fprintf(fp, " YES "); else (void) fprintf(fp, " NO "); (void) fprintf(fp, "%s\n", zfs_prop_values(prop) ?: "-"); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static __attribute__((noreturn)) void usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { (void) fprintf(fp, gettext("usage: zfs command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nEach dataset is of the form: " "pool/[dataset/]*dataset[@name]\n")); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && (strcmp(current_command->name, "set") == 0 || strcmp(current_command->name, "get") == 0 || strcmp(current_command->name, "inherit") == 0 || strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; if (show_properties) { (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); (void) fprintf(fp, "\t%-15s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "written@"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard 
units such as K, M, G, etc.\n")); (void) fprintf(fp, "%s", gettext("\nUser-defined properties " "can be specified by using a name containing a colon " "(:).\n")); (void) fprintf(fp, gettext("\nThe {user|group|project}" "[obj]{used|quota}@ properties must be appended with\n" "a user|group|project specifier of one of these forms:\n" " POSIX name (eg: \"matt\")\n" " POSIX id (eg: \"126829\")\n" " SMB name@domain (eg: \"matt@sun\")\n" " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"), "zfs set|get"); (void) fprintf(fp, gettext("\nFor the delegated permission list, run: %s\n"), "zfs allow|unallow"); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * Take a property=value argument string and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parseprop(nvlist_t *props, char *propname) { char *propval; if ((propval = strchr(propname, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for property=value argument\n")); return (B_FALSE); } *propval = '\0'; propval++; if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_string(props, propname, propval) != 0) nomem(); return (B_TRUE); } /* * Take a property name argument and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parsepropname(nvlist_t *props, char *propname) { if (strchr(propname, '=') != NULL) { (void) fprintf(stderr, gettext("invalid character " "'=' in property argument\n")); return (B_FALSE); } if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_boolean(props, propname) != 0) nomem(); return (B_TRUE); } static int parse_depth(char *opt, int *flags) { char *tmp; int depth; depth = (int)strtol(opt, &tmp, 0); if (*tmp) { (void) fprintf(stderr, gettext("%s is not an integer\n"), optarg); usage(B_FALSE); } if (depth < 0) { (void) fprintf(stderr, gettext("Depth can not be negative.\n")); usage(B_FALSE); } *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); return (depth); } #define PROGRESS_DELAY 2 /* seconds */ static const char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; static time_t pt_begin; static char *pt_header = NULL; static boolean_t pt_shown; static void start_progress_timer(void) { pt_begin = time(NULL) + PROGRESS_DELAY; pt_shown = B_FALSE; } static void set_progress_header(const char *header) { assert(pt_header == NULL); pt_header = safe_strdup(header); if (pt_shown) { (void) printf("%s: ", header); (void) fflush(stdout); } } static void update_progress(const char *update) { if (!pt_shown && time(NULL) > pt_begin) { int len = strlen(update); (void) printf("%s: %s%*.*s", pt_header, update, len, len, pt_reverse); (void) fflush(stdout); pt_shown = B_TRUE; } else if (pt_shown) { int len = strlen(update); (void) printf("%s%*.*s", update, len, len, pt_reverse); (void) fflush(stdout); } } static void finish_progress(const char *done) { if (pt_shown) { (void) puts(done); (void) fflush(stdout); } free(pt_header); pt_header = NULL; } static int zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) { zfs_handle_t *zhp = NULL; int ret = 0; zhp = zfs_open(hdl, dataset, type); if (zhp == NULL) return (1); /* * Volumes may neither be 
mounted or shared. Potentially in the * future filesystems detected on these volumes could be mounted. */ if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) { zfs_close(zhp); return (0); } /* * Mount and/or share the new filesystem as appropriate. We provide a * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. * * If the user doesn't want the dataset automatically mounted, then * skip the mount/share step */ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { if (zfs_mount_delegation_check()) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but it may only be " "mounted by root\n")); ret = 1; } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); ret = 1; } else if (zfs_share(zhp, NULL) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); ret = 1; } zfs_commit_shares(NULL); } zfs_close(zhp); return (ret); } /* * zfs clone [-p] [-o prop=value] ... * * Given an existing dataset, create a writable copy whose initial contents * are the same as the source. The newly created dataset maintains a * dependency on the original; the original cannot be destroyed so long as * the clone exists. * * The '-p' flag creates all the non-existing ancestors of the target first. */ static int zfs_do_clone(int argc, char **argv) { zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; int ret = 0; int c; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "o:p")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; case 'p': parents = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); goto usage; } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); goto usage; } /* open the source dataset */ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) { nvlist_free(props); return (1); } if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { /* * Now create the ancestors of the target dataset. If the * target already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_close(zhp); nvlist_free(props); return (0); } if (zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); nvlist_free(props); return (1); } } /* pass to libzfs */ ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ if (ret == 0) { if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); } zfs_close(zhp); nvlist_free(props); return (!!ret); usage: ASSERT3P(zhp, ==, NULL); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Return a default volblocksize for the pool which always uses more than * half of the data sectors. This primarily applies to dRAID which always * writes full stripe widths. 
*/ static uint64_t default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) { uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; nvlist_t *config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (ZVOL_DEFAULT_BLOCKSIZE); } for (int i = 0; i < nvdevs; i++) { nvlist_t *nv = vdevs[i]; uint64_t ashift, ndata, nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0) { /* dRAID minimum allocation width */ asize = MAX(asize, ndata * (1ULL << ashift)); } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { /* raidz minimum allocation width */ if (nparity == 1) asize = MAX(asize, 2 * (1ULL << ashift)); else asize = MAX(asize, 4 * (1ULL << ashift)); } else { /* mirror or (non-redundant) leaf vdev */ asize = MAX(asize, 1ULL << ashift); } } /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. * * n asize blksz used | n asize blksz used * -------------------------+--------------------------------- * 1 4,096 8,192 100% | 9 36,864 32,768 88% * 2 8,192 8,192 100% | 10 40,960 32,768 80% * 3 12,288 8,192 66% | 11 45,056 32,768 72% * 4 16,384 16,384 100% | 12 49,152 32,768 66% * 5 20,480 16,384 80% | 13 53,248 32,768 61% * 6 24,576 16,384 66% | 14 57,344 32,768 57% * 7 28,672 16,384 57% | 15 61,440 32,768 53% * 8 32,768 32,768 100% | 16 65,536 65,636 100% * * This is primarily a concern for dRAID which always allocates * a full stripe width. For dRAID the default stripe width is * n=8 in which case the volblocksize is set to 32k. Ignoring * compression there are no unused sectors. This same reasoning * applies to raidz[2,3] so target 4 sectors to minimize waste. */ uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; while (tgt_volblocksize * 2 <= asize) tgt_volblocksize *= 2; const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { /* Issue a warning when a non-optimal size is requested. */ if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is less than the default " "minimum block size (%llu).\nTo reduce wasted " "space a volblocksize of %llu is recommended.\n"), (u_longlong_t)volblocksize, (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, (u_longlong_t)tgt_volblocksize); } else if (volblocksize < tgt_volblocksize) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is much less than the " "minimum allocation\nunit (%llu), which wastes " "at least %llu%% of space. To reduce wasted " "space,\nuse a larger volblocksize (%llu is " "recommended), fewer dRAID data disks\n" "per group, or smaller sector size (ashift).\n"), (u_longlong_t)volblocksize, (u_longlong_t)asize, (u_longlong_t)((100 * (asize - volblocksize)) / asize), (u_longlong_t)tgt_volblocksize); } } else { volblocksize = tgt_volblocksize; fnvlist_add_uint64(props, prop, volblocksize); } return (volblocksize); } /* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems * and volumes. Snapshot creation is handled by 'zfs snapshot'. * For volumes, the user must specify a size to be used. 
* * The '-s' flag applies only to volumes, and indicates that we should not try * to set the reservation for this volume. By default we set a reservation * equal to the size for any volume. For pools with SPA_VERSION >= * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. * * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity * check of arguments and properties, but does not check for permissions, * available space, etc. * * The '-u' flag prevents the newly created file system from being mounted. * * The '-v' flag is for verbose output. * * The '-P' flag is used for parseable output. It implies '-v'. */ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; zpool_handle_t *zpool_handle = NULL; nvlist_t *real_props = NULL; uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t nomount = B_FALSE; boolean_t verbose = B_FALSE; boolean_t parseable = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":PV:b:nso:puv")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) nomem(); volsize = intval; break; case 'P': verbose = B_TRUE; parseable = B_TRUE; break; case 'p': parents = B_TRUE; break; case 'b': bflag = B_TRUE; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "block size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), intval) != 0) nomem(); break; case 'n': dryrun = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) goto error; break; case 's': noreserve = B_TRUE; break; case 'u': nomount = B_TRUE; break; case 'v': verbose = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { (void) fprintf(stderr, gettext("'-s' and '-b' can only be " "used when creating a volume\n")); goto badusage; } if (nomount && type != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("'-u' can only be " "used when creating a filesystem\n")); goto badusage; } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing %s argument\n"), zfs_type_to_name(type)); goto badusage; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); goto badusage; } if (dryrun || type == ZFS_TYPE_VOLUME) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) *p = '/'; if (zpool_handle == NULL) goto error; (void) snprintf(msg, sizeof (msg), dryrun ? 
gettext("cannot verify '%s'") : gettext("cannot create '%s'"), argv[0]); if (props && (real_props = zfs_valid_proplist(g_zfs, type, props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { zpool_close(zpool_handle); goto error; } } if (type == ZFS_TYPE_VOLUME) { const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); uint64_t volblocksize = default_volblocksize(zpool_handle, real_props); if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && nvlist_lookup_string(props, prop, &strval) != 0) { if (asprintf(&strval, "%llu", (u_longlong_t)volblocksize) == -1) nomem(); nvlist_add_string(props, prop, strval); free(strval); } /* * If volsize is not a multiple of volblocksize, round it * up to the nearest multiple of the volblocksize. */ if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { nvlist_free(props); nomem(); } } } if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); if (spa_version >= SPA_VERSION_REFRESERVATION) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; volsize = zvol_volsize_to_reservation(zpool_handle, volsize, real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { if (nvlist_add_uint64(props, zfs_prop_to_name(resv_prop), volsize) != 0) { nvlist_free(props); nomem(); } } } if (zpool_handle != NULL) { zpool_close(zpool_handle); nvlist_free(real_props); } if (parents && zfs_name_valid(argv[0], type)) { /* * Now create the ancestors of target dataset. If the target * already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[0], type)) { ret = 0; goto error; } if (verbose) { (void) printf(parseable ? "create_ancestors\t%s\n" : dryrun ? "would create ancestors of %s\n" : "create ancestors of %s\n", argv[0]); } if (!dryrun) { if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { goto error; } } } if (verbose) { nvpair_t *nvp = NULL; (void) printf(parseable ? "create\t%s\n" : dryrun ? "would create %s\n" : "create %s\n", argv[0]); while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { uint64_t uval; char *sval; switch (nvpair_type(nvp)) { case DATA_TYPE_UINT64: VERIFY0(nvpair_value_uint64(nvp, &uval)); (void) printf(parseable ? "property\t%s\t%llu\n" : "\t%s=%llu\n", nvpair_name(nvp), (u_longlong_t)uval); break; case DATA_TYPE_STRING: VERIFY0(nvpair_value_string(nvp, &sval)); (void) printf(parseable ? "property\t%s\t%s\n" : "\t%s=%s\n", nvpair_name(nvp), sval); break; default: (void) fprintf(stderr, "property '%s' " "has illegal type %d\n", nvpair_name(nvp), nvpair_type(nvp)); abort(); } } } if (dryrun) { ret = 0; goto error; } /* pass to libzfs */ if (zfs_create(g_zfs, argv[0], type, props) != 0) goto error; if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (nomount) { ret = 0; goto error; } ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); return (ret); badusage: nvlist_free(props); usage(B_FALSE); return (2); } /* * zfs destroy [-rRf] * zfs destroy [-rRd] * * -r Recursively destroy all children * -R Recursively destroy all dependents, including clones * -f Force unmounting of any dependents * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. 
By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can * either be a child, or a clone of a child. */ typedef struct destroy_cbdata { boolean_t cb_first; boolean_t cb_force; boolean_t cb_recurse; boolean_t cb_error; boolean_t cb_doclones; zfs_handle_t *cb_target; boolean_t cb_defer_destroy; boolean_t cb_verbose; boolean_t cb_parsable; boolean_t cb_dryrun; nvlist_t *cb_nvl; nvlist_t *cb_batchedsnaps; /* first snap in contiguous run */ char *cb_firstsnap; /* previous snap in contiguous run */ char *cb_prevsnap; int64_t cb_snapused; char *cb_snapspec; char *cb_bookmark; uint64_t cb_snap_count; } destroy_cbdata_t; /* * Check for any dependents based on the '-r' or '-R' flags. */ static int destroy_check_dependent(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cbp = data; const char *tname = zfs_get_name(cbp->cb_target); const char *name = zfs_get_name(zhp); if (strncmp(tname, name, strlen(tname)) == 0 && (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { /* * This is a direct descendant, not a clone somewhere else in * the hierarchy. */ if (cbp->cb_recurse) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has children\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } else { /* * This is a clone. We only want to report this if the '-r' * wasn't specified, or the target is a snapshot. */ if (!cbp->cb_recurse && zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has dependent clones\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } out: zfs_close(zhp); return (0); } static int destroy_batched(destroy_cbdata_t *cb) { int error = zfs_destroy_snaps_nvl(g_zfs, cb->cb_batchedsnaps, B_FALSE); fnvlist_free(cb->cb_batchedsnaps); cb->cb_batchedsnaps = fnvlist_alloc(); return (error); } static int destroy_callback(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cb = data; const char *name = zfs_get_name(zhp); int error; if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } /* * Ignore pools (which we've already flagged as an error before getting * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { zfs_close(zhp); return (0); } if (cb->cb_dryrun) { zfs_close(zhp); return (0); } /* * We batch up all contiguous snapshots (even of different * filesystems) and destroy them with one ioctl. We can't * simply do all snap deletions and then all fs deletions, * because we must delete a clone before its origin. */ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { cb->cb_snap_count++; fnvlist_add_boolean(cb->cb_batchedsnaps, name); if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) error = destroy_batched(cb); } else { error = destroy_batched(cb); if (error != 0 || zfs_unmount(zhp, NULL, cb->cb_force ? 
MS_FORCE : 0) != 0 || zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { zfs_close(zhp); /* * When performing a recursive destroy we ignore errors * so that the recursive destroy could continue * destroying past problem datasets */ if (cb->cb_recurse) { cb->cb_error = B_TRUE; return (0); } return (-1); } } zfs_close(zhp); return (0); } static int destroy_print_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; const char *name = zfs_get_name(zhp); int err = 0; if (nvlist_exists(cb->cb_nvl, name)) { if (cb->cb_firstsnap == NULL) cb->cb_firstsnap = strdup(name); if (cb->cb_prevsnap != NULL) free(cb->cb_prevsnap); /* this snap continues the current range */ cb->cb_prevsnap = strdup(name); if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) nomem(); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } } else if (cb->cb_firstsnap != NULL) { /* end of this range */ uint64_t used = 0; err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } zfs_close(zhp); return (err); } static int destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) { int err; assert(cb->cb_firstsnap == NULL); assert(cb->cb_prevsnap == NULL); err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0); if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); } cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } return (err); } static int snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; /* Check for clones. */ if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; err = zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, cb); } if (err == 0) { if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) nomem(); } zfs_close(zhp); return (err); } static int gather_snapshots(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); if (err == ENOENT) err = 0; if (err != 0) goto out; if (cb->cb_verbose) { err = destroy_print_snapshots(zhp, cb); if (err != 0) goto out; } if (cb->cb_recurse) err = zfs_iter_filesystems(zhp, gather_snapshots, cb); out: zfs_close(zhp); return (err); } static int destroy_clones(destroy_cbdata_t *cb) { nvpair_t *pair; for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); pair != NULL; pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { boolean_t defer = cb->cb_defer_destroy; int err; /* * We can't defer destroy non-snapshots, so set it to * false while destroying the clones. 
*/ cb->cb_defer_destroy = B_FALSE; err = zfs_iter_dependents(zhp, B_FALSE, destroy_callback, cb); cb->cb_defer_destroy = defer; zfs_close(zhp); if (err != 0) return (err); } } return (0); } static int zfs_do_destroy(int argc, char **argv) { destroy_cbdata_t cb = { 0 }; int rv = 0; int err = 0; int c; zfs_handle_t *zhp = NULL; char *at, *pound; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; break; case 'p': cb.cb_verbose = B_TRUE; cb.cb_parsable = B_TRUE; break; case 'n': cb.cb_dryrun = B_TRUE; break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': cb.cb_force = B_TRUE; break; case 'r': cb.cb_recurse = B_TRUE; break; case 'R': cb.cb_recurse = B_TRUE; cb.cb_doclones = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } at = strchr(argv[0], '@'); pound = strchr(argv[0], '#'); if (at != NULL) { /* Build the list of snaps to destroy in cb_nvl. */ cb.cb_nvl = fnvlist_alloc(); *at = '\0'; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(cb.cb_nvl); return (1); } cb.cb_snapspec = at + 1; if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || cb.cb_error) { rv = 1; goto out; } if (nvlist_empty(cb.cb_nvl)) { (void) fprintf(stderr, gettext("could not find any " "snapshots to destroy; check snapshot names.\n")); rv = 1; goto out; } if (cb.cb_verbose) { char buf[16]; zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf)); if (cb.cb_parsable) { (void) printf("reclaim\t%llu\n", (u_longlong_t)cb.cb_snapused); } else if (cb.cb_dryrun) { (void) printf(gettext("would reclaim %s\n"), buf); } else { (void) printf(gettext("will reclaim %s\n"), buf); } } if (!cb.cb_dryrun) { if (cb.cb_doclones) { cb.cb_batchedsnaps = fnvlist_alloc(); err = destroy_clones(&cb); if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, B_FALSE); } if (err != 0) { rv = 1; goto out; } } if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, cb.cb_defer_destroy); } } if (err != 0) rv = 1; } else if (pound != NULL) { int err; nvlist_t *nvl; if (cb.cb_dryrun) { (void) fprintf(stderr, "dryrun is not supported with bookmark\n"); return (-1); } if (cb.cb_defer_destroy) { (void) fprintf(stderr, "defer destroy is not supported with bookmark\n"); return (-1); } if (cb.cb_recurse) { (void) fprintf(stderr, "recursive is not supported with bookmark\n"); return (-1); } /* * Unfortunately, zfs_bookmark() doesn't honor the * casesensitivity setting. However, we can't simply * remove this check, because lzc_destroy_bookmarks() * ignores non-existent bookmarks, so this is necessary * to get a proper error message. 
*/ if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); return (1); } nvl = fnvlist_alloc(); fnvlist_add_boolean(nvl, argv[0]); err = lzc_destroy_bookmarks(nvl, NULL); if (err != 0) { (void) zfs_standard_error(g_zfs, err, "cannot destroy bookmark"); } nvlist_free(nvl); return (err); } else { /* Open the given dataset */ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; /* * Perform an explicit check for pools before going any further. */ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "operation does not apply to pools\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zfs destroy -r " "%s' to destroy all datasets in the pool\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zpool destroy %s' " "to destroy the pool itself\n"), zfs_get_name(zhp)); rv = 1; goto out; } /* * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; if (!cb.cb_doclones && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { rv = 1; goto out; } if (cb.cb_error) { rv = 1; goto out; } cb.cb_batchedsnaps = fnvlist_alloc(); if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { rv = 1; goto out; } /* * Do the real thing. The callback will close the * handle regardless of whether it succeeds or not. */ err = destroy_callback(zhp, &cb); zhp = NULL; if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, cb.cb_defer_destroy); } if (err != 0 || cb.cb_error == B_TRUE) rv = 1; } out: fnvlist_free(cb.cb_batchedsnaps); fnvlist_free(cb.cb_nvl); if (zhp != NULL) zfs_close(zhp); return (rv); } static boolean_t is_recvd_column(zprop_get_cbdata_t *cbp) { int i; zfs_get_column_t col; for (i = 0; i < ZFS_GET_NCOLS && (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) if (col == GET_COL_RECVD) return (B_TRUE); return (B_FALSE); } /* * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] * < all | property[,property]... > < fs | snap | vol > ... * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. * -o Set of fields to display. One of "name,property,value, * received,source". Default is "name,property,value,source". * "all" is an alias for all five. * -s Set of sources to allow. One of * "local,default,inherited,received,temporary,none". Default is * all six. * -p Display values in parsable (literal) format. * * Prints properties for the given datasets. The user can control which * columns to display as well as which property types to allow. */ /* * Invoked to display the properties for a single dataset. */ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAX_DATASET_NAME_LEN]; zprop_get_cbdata_t *cbp = data; nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; const char *strval; const char *sourceval; boolean_t received = is_recvd_column(cbp); for (; pl != NULL; pl = pl->pl_next) { char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. 
*/ if (pl->pl_prop == ZFS_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, pl->pl_prop, buf, sizeof (buf), &sourcetype, source, sizeof (source), cbp->cb_literal) != 0) { if (pl->pl_all) continue; if (!zfs_prop_valid_for_type(pl->pl_prop, ZFS_TYPE_DATASET, B_FALSE)) { (void) fprintf(stderr, gettext("No such property '%s'\n"), zfs_prop_to_name(pl->pl_prop)); continue; } sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } if (received && (zfs_prop_get_recvd(zhp, zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), buf, sourcetype, source, recvdval); } else if (zfs_prop_userquota(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else if (zfs_prop_written(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_written(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; sourcetype = ZPROP_SRC_NONE; strval = "-"; } else { strval = fnvlist_lookup_string(propval, ZPROP_VALUE); sourceval = fnvlist_lookup_string(propval, ZPROP_SOURCE); if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; } else if (strcmp(sourceval, ZPROP_SOURCE_VAL_RECVD) == 0) { sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, sourceval, sizeof (source)); } } if (received && (zfs_prop_get_recvd(zhp, pl->pl_user_prop, rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, source, recvdval); } } return (0); } static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; char *fields; int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'o': /* * Process the set of columns to display. We zero out * the structure to give us a blank slate. 
*/ memset(&cb.cb_columns, 0, sizeof (cb.cb_columns)); i = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_subopts[] = { "name", "property", "value", "received", "source", "all" }; static const zfs_get_column_t col_subopt_col[] = { GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE }; static const int col_subopt_flags[] = { 0, 0, 0, ZFS_ITER_RECVD_PROPS, 0 }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } for (c = 0; c < ARRAY_SIZE(col_subopts); ++c) if (strcmp(tok, col_subopts[c]) == 0) goto found; (void) fprintf(stderr, gettext("invalid column name '%s'\n"), tok); usage(B_FALSE); found: if (c >= 5) { if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } memcpy(cb.cb_columns, col_subopt_col, sizeof (col_subopt_col)); flags |= ZFS_ITER_RECVD_PROPS; i = ZFS_GET_NCOLS; } else { cb.cb_columns[i++] = col_subopt_col[c]; flags |= col_subopt_flags[c]; } } break; case 's': cb.cb_sources = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const source_opt[] = { "local", "default", "inherited", "received", "temporary", "none" }; static const int source_flg[] = { ZPROP_SRC_LOCAL, ZPROP_SRC_DEFAULT, ZPROP_SRC_INHERITED, ZPROP_SRC_RECEIVED, ZPROP_SRC_TEMPORARY, ZPROP_SRC_NONE }; for (i = 0; i < ARRAY_SIZE(source_opt); ++i) if (strcmp(tok, source_opt[i]) == 0) { cb.cb_sources |= source_flg[i]; goto found2; } (void) fprintf(stderr, gettext("invalid source '%s'\n"), tok); usage(B_FALSE); found2:; } break; case 't': types = 0; flags &= ~ZFS_ITER_PROP_LISTSNAPS; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_opts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all" }; static const int type_types[] = { ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; for (i = 0; i < ARRAY_SIZE(type_opts); ++i) if (strcmp(tok, type_opts[i]) == 0) { types |= type_types[i]; goto found3; } (void) fprintf(stderr, gettext("invalid type '%s'\n"), tok); usage(B_FALSE); found3:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } fields = argv[0]; /* * Handle users who want to get all snapshots or bookmarks * of a dataset (ex. 'zfs get -t snapshot refer '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); argc--; argv++; /* * As part of zfs_expand_proplist(), we keep track of the maximum column * width for each property. For the 'NAME' (and 'SOURCE') columns, we * need to know the maximum name length. However, the user likely did * not specify 'name' as one of the properties to fetch, so we need to * make sure we always include at least this property for * print_get_headers() to work properly. 
*/ if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZFS_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } cb.cb_first = B_TRUE; /* run for each object */ ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } /* * inherit [-rS] ... * * -r Recurse over all children * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to * use the default value. The '-r' flag will recurse over all children, and is * useful for setting a property on a hierarchy-wide basis, regardless of any * local modifications for each dataset. */ typedef struct inherit_cbdata { const char *cb_propname; boolean_t cb_received; } inherit_cbdata_t; static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that * are not valid for this type of dataset. */ if (prop != ZPROP_INVAL && !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE)) return (0); return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; int ret = 0; int flags = 0; boolean_t received = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'S': received = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing property argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } propname = argv[0]; argc--; argv++; if ((prop = zfs_name_to_prop(propname)) != ZPROP_USERPROP) { if (zfs_prop_readonly(prop)) { (void) fprintf(stderr, gettext( "%s property is read-only\n"), propname); return (1); } if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); (void) fprintf(stderr, gettext("use 'zfs " "inherit -S %s' to revert to received " "value\n"), propname); } return (1); } if (received && (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION)) { (void) fprintf(stderr, gettext("'%s' property cannot " "be reverted to a received value\n"), propname); return (1); } } else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } cb.cb_propname = propname; cb.cb_received = received; if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_cb, 
&cb); } return (ret); } typedef struct upgrade_cbdata { uint64_t cb_numupgraded; uint64_t cb_numsamegraded; uint64_t cb_numfailed; uint64_t cb_version; boolean_t cb_newer; boolean_t cb_foundone; char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; } upgrade_cbdata_t; static int same_pool(zfs_handle_t *zhp, const char *name) { int len1 = strcspn(name, "/@"); const char *zhname = zfs_get_name(zhp); int len2 = strcspn(zhname, "/@"); if (len1 != len2) return (B_FALSE); return (strncmp(name, zhname, len1) == 0); } static int upgrade_list_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); /* list if it's old/new */ if ((!cb->cb_newer && version < ZPL_VERSION) || (cb->cb_newer && version > ZPL_VERSION)) { char *str; if (cb->cb_newer) { str = gettext("The following filesystems are " "formatted using a newer software version and\n" "cannot be accessed on the current system.\n\n"); } else { str = gettext("The following filesystems are " "out of date, and can be upgraded. After being\n" "upgraded, these filesystems (and any 'zfs send' " "streams generated from\n" "subsequent snapshots) will no longer be " "accessible by older software versions.\n\n"); } if (!cb->cb_foundone) { (void) puts(str); (void) printf(gettext("VER FILESYSTEM\n")); (void) printf(gettext("--- ------------\n")); cb->cb_foundone = B_TRUE; } (void) printf("%2u %s\n", version, zfs_get_name(zhp)); } return (0); } static int upgrade_set_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int needed_spa_version; int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); needed_spa_version = zfs_spa_version_map(cb->cb_version); if (needed_spa_version < 0) return (-1); if (spa_version < needed_spa_version) { /* can't upgrade */ (void) printf(gettext("%s: can not be " "upgraded; the pool version needs to first " "be upgraded\nto version %d\n\n"), zfs_get_name(zhp), needed_spa_version); cb->cb_numfailed++; return (0); } /* upgrade */ if (version < cb->cb_version) { char verstr[24]; (void) snprintf(verstr, sizeof (verstr), "%llu", (u_longlong_t)cb->cb_version); if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). 
*/ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; else cb->cb_numfailed++; (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp)); } else if (version > cb->cb_version) { /* can't downgrade */ (void) printf(gettext("%s: can not be downgraded; " "it is already at version %u\n"), zfs_get_name(zhp), version); cb->cb_numfailed++; } else { cb->cb_numsamegraded++; } return (0); } /* * zfs upgrade * zfs upgrade -v * zfs upgrade [-r] [-V ] <-a | filesystem> */ static int zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; int ret = 0; upgrade_cbdata_t cb = { 0 }; int c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "rvV:a")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'v': showversions = B_TRUE; break; case 'V': if (zfs_prop_string_to_index(ZFS_PROP_VERSION, optarg, &cb.cb_version) != 0) { (void) fprintf(stderr, gettext("invalid version %s\n"), optarg); usage(B_FALSE); } break; case 'a': all = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) usage(B_FALSE); if (showversions && (flags & ZFS_ITER_RECURSE || all || cb.cb_version || argc)) usage(B_FALSE); if ((all || argc) && (showversions)) usage(B_FALSE); if (all && argc) usage(B_FALSE); if (showversions) { /* Show info on available versions. */ (void) printf(gettext("The following filesystem versions are " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and filesystem " "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf("see the ZFS Administration Guide.\n\n"); ret = 0; } else if (argc || all) { /* Upgrade filesystems */ if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), (u_longlong_t)cb.cb_numupgraded); if (cb.cb_numsamegraded) { (void) printf(gettext("%llu filesystems already at " "this version\n"), (u_longlong_t)cb.cb_numsamegraded); } if (cb.cb_numfailed != 0) ret = 1; } else { /* List old-version filesystems */ boolean_t found; (void) printf(gettext("This system is currently running " "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " "formatted with the current version.\n")); } } return (ret); } /* * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] 
[-t type[,...]] * filesystem | snapshot | path * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] * filesystem | snapshot | path * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] filesystem | snapshot | path * * -H Scripted mode; elide headers and separate columns by tabs. * -i Translate SID to POSIX ID. * -n Print numeric ID instead of user/group name. * -o Control which fields to display. * -p Use exact (parsable) numeric output. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * Displays space consumed by, and quotas on, each user in the specified * filesystem or snapshot. */ /* us_field_types, us_field_hdr and us_field_names should be kept in sync */ enum us_field_types { USFIELD_TYPE, USFIELD_NAME, USFIELD_USED, USFIELD_QUOTA, USFIELD_OBJUSED, USFIELD_OBJQUOTA }; static const char *const us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA", "OBJUSED", "OBJQUOTA" }; static const char *const us_field_names[] = { "type", "name", "used", "quota", "objused", "objquota" }; #define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) #define USTYPE_PSX_GRP (1 << 0) #define USTYPE_PSX_USR (1 << 1) #define USTYPE_SMB_GRP (1 << 2) #define USTYPE_SMB_USR (1 << 3) #define USTYPE_PROJ (1 << 4) #define USTYPE_ALL \ (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \ USTYPE_PROJ) static int us_type_bits[] = { USTYPE_PSX_GRP, USTYPE_PSX_USR, USTYPE_SMB_GRP, USTYPE_SMB_USR, USTYPE_ALL }; static const char *const us_type_names[] = { "posixgroup", "posixuser", "smbgroup", "smbuser", "all" }; typedef struct us_node { nvlist_t *usn_nvl; uu_avl_node_t usn_avlnode; uu_list_node_t usn_listnode; } us_node_t; typedef struct us_cbdata { nvlist_t **cb_nvlp; uu_avl_pool_t *cb_avl_pool; uu_avl_t *cb_avl; boolean_t cb_numname; boolean_t cb_nicenum; boolean_t cb_sid2posix; zfs_userquota_prop_t cb_prop; zfs_sort_column_t *cb_sortcol; size_t cb_width[USFIELD_LAST]; } us_cbdata_t; static boolean_t us_populated = B_FALSE; typedef struct { zfs_sort_column_t *si_sortcol; boolean_t si_numname; } us_sort_info_t; static int us_field_index(const char *field) { for (int i = 0; i < USFIELD_LAST; i++) { if (strcmp(field, us_field_names[i]) == 0) return (i); } return (-1); } static int us_compare(const void *larg, const void *rarg, void *unused) { const us_node_t *l = larg; const us_node_t *r = rarg; us_sort_info_t *si = (us_sort_info_t *)unused; zfs_sort_column_t *sortcol = si->si_sortcol; boolean_t numname = si->si_numname; nvlist_t *lnvl = l->usn_nvl; nvlist_t *rnvl = r->usn_nvl; int rc = 0; boolean_t lvb, rvb; for (; sortcol != NULL; sortcol = sortcol->sc_next) { char *lvstr = (char *)""; char *rvstr = (char *)""; uint32_t lv32 = 0; uint32_t rv32 = 0; uint64_t lv64 = 0; uint64_t rv64 = 0; zfs_prop_t prop = sortcol->sc_prop; const char *propname = NULL; boolean_t reverse = sortcol->sc_reverse; switch (prop) { case ZFS_PROP_TYPE: propname = "type"; (void) nvlist_lookup_uint32(lnvl, propname, &lv32); (void) nvlist_lookup_uint32(rnvl, propname, &rv32); if (rv32 != lv32) rc = (rv32 < lv32) ? 1 : -1; break; case ZFS_PROP_NAME: propname = "name"; if (numname) { compare_nums: (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 
1 : -1; } else { if ((nvlist_lookup_string(lnvl, propname, &lvstr) == ENOENT) || (nvlist_lookup_string(rnvl, propname, &rvstr) == ENOENT)) { goto compare_nums; } rc = strcmp(lvstr, rvstr); } break; case ZFS_PROP_USED: case ZFS_PROP_QUOTA: if (!us_populated) break; if (prop == ZFS_PROP_USED) propname = "used"; else propname = "quota"; (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; break; default: break; } if (rc != 0) { if (rc < 0) return (reverse ? 1 : -1); else return (reverse ? -1 : 1); } } /* * If entries still seem to be the same, check if they are of the same * type (smbentity is added only if we are doing SID to POSIX ID * translation where we can have duplicate type/name combinations). */ if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && lvb != rvb) return (lvb < rvb ? -1 : 1); return (0); } static boolean_t zfs_prop_is_user(unsigned p) { return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA || p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA); } static boolean_t zfs_prop_is_group(unsigned p) { return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA || p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA); } static boolean_t zfs_prop_is_project(unsigned p) { return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA || p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA); } static inline const char * us_type2str(unsigned field_type) { switch (field_type) { case USTYPE_PSX_USR: return ("POSIX User"); case USTYPE_PSX_GRP: return ("POSIX Group"); case USTYPE_SMB_USR: return ("SMB User"); case USTYPE_SMB_GRP: return ("SMB Group"); case USTYPE_PROJ: return ("Project"); default: return ("Undefined"); } } static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { us_cbdata_t *cb = (us_cbdata_t *)arg; zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; const char *propname; char sizebuf[32]; us_node_t *node; uu_avl_pool_t *avl_pool = cb->cb_avl_pool; uu_avl_t *avl = cb->cb_avl; uu_avl_index_t idx; nvlist_t *props; us_node_t *n; zfs_sort_column_t *sortcol = cb->cb_sortcol; unsigned type = 0; const char *typestr; size_t namelen; size_t typelen; size_t sizelen; int typeidx, nameidx, sizeidx; us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; boolean_t smbentity = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); node = safe_malloc(sizeof (us_node_t)); uu_avl_node_init(node, &node->usn_avlnode, avl_pool); node->usn_nvl = props; if (domain != NULL && domain[0] != '\0') { #ifdef HAVE_IDMAP /* SMB */ char sid[MAXNAMELEN + 32]; uid_t id; uint64_t classes; int err; directory_error_t e; smbentity = B_TRUE; (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { type = USTYPE_SMB_GRP; err = sid_to_id(sid, B_FALSE, &id); } else { type = USTYPE_SMB_USR; err = sid_to_id(sid, B_TRUE, &id); } if (err == 0) { rid = id; if (!cb->cb_sid2posix) { e = directory_name_from_sid(NULL, sid, &name, &classes); if (e != NULL) directory_error_free(e); if (name == NULL) name = sid; } } #else nvlist_free(props); free(node); return (-1); #endif /* HAVE_IDMAP */ } if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { /* POSIX or -i */ if (zfs_prop_is_group(prop)) { type = USTYPE_PSX_GRP; if (!cb->cb_numname) { struct group *g; if ((g = getgrgid(rid)) != NULL) name = g->gr_name; } } 
else if (zfs_prop_is_user(prop)) { type = USTYPE_PSX_USR; if (!cb->cb_numname) { struct passwd *p; if ((p = getpwuid(rid)) != NULL) name = p->pw_name; } } else { type = USTYPE_PROJ; } } /* * Make sure that the type/name combination is unique when doing * SID to POSIX ID translation (hence changing the type from SMB to * POSIX). */ if (cb->cb_sid2posix && nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) nomem(); /* Calculate/update width of TYPE field */ typestr = us_type2str(type); typelen = strlen(gettext(typestr)); typeidx = us_field_index("type"); if (typelen > cb->cb_width[typeidx]) cb->cb_width[typeidx] = typelen; if (nvlist_add_uint32(props, "type", type) != 0) nomem(); /* Calculate/update width of NAME field */ if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { if (nvlist_add_uint64(props, "name", rid) != 0) nomem(); namelen = snprintf(NULL, 0, "%u", rid); } else { if (nvlist_add_string(props, "name", name) != 0) nomem(); namelen = strlen(name); } nameidx = us_field_index("name"); if (nameidx >= 0 && namelen > cb->cb_width[nameidx]) cb->cb_width[nameidx] = namelen; /* * Check if this type/name combination is in the list and update it; * otherwise add new node to the list. */ if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { uu_avl_insert(avl, node, idx); } else { nvlist_free(props); free(node); node = n; props = node->usn_nvl; } /* Calculate/update width of USED/QUOTA fields */ if (cb->cb_nicenum) { if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTUSED || prop == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(space, sizebuf, sizeof (sizebuf)); } else { zfs_nicenum(space, sizebuf, sizeof (sizebuf)); } } else { (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", (u_longlong_t)space); } sizelen = strlen(sizebuf); if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_PROJECTUSED) { propname = "used"; if (!nvlist_exists(props, "quota")) (void) nvlist_add_uint64(props, "quota", 0); } else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTQUOTA) { propname = "quota"; if (!nvlist_exists(props, "used")) (void) nvlist_add_uint64(props, "used", 0); } else if (prop == ZFS_PROP_USEROBJUSED || prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) { propname = "objused"; if (!nvlist_exists(props, "objquota")) (void) nvlist_add_uint64(props, "objquota", 0); } else if (prop == ZFS_PROP_USEROBJQUOTA || prop == ZFS_PROP_GROUPOBJQUOTA || prop == ZFS_PROP_PROJECTOBJQUOTA) { propname = "objquota"; if (!nvlist_exists(props, "objused")) (void) nvlist_add_uint64(props, "objused", 0); } else { return (-1); } sizeidx = us_field_index(propname); if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx]) cb->cb_width[sizeidx] = sizelen; if (nvlist_add_uint64(props, propname, space) != 0) nomem(); return (0); } static void print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, us_node_t *node) { nvlist_t *nvl = node->usn_nvl; char valstr[MAXNAMELEN]; boolean_t first = B_TRUE; int cfield = 0; int field; uint32_t ustype; /* Check type */ (void) nvlist_lookup_uint32(nvl, "type", &ustype); if (!(ustype & types)) return; while ((field = fields[cfield]) != USFIELD_LAST) { nvpair_t *nvp = NULL; data_type_t type; uint32_t val32 = -1; uint64_t val64 = -1; const char *strval = "-"; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) if (strcmp(nvpair_name(nvp), us_field_names[field]) == 0) 
break; type = nvp == NULL ? DATA_TYPE_UNKNOWN : nvpair_type(nvp); switch (type) { case DATA_TYPE_UINT32: val32 = fnvpair_value_uint32(nvp); break; case DATA_TYPE_UINT64: val64 = fnvpair_value_uint64(nvp); break; case DATA_TYPE_STRING: strval = fnvpair_value_string(nvp); break; case DATA_TYPE_UNKNOWN: break; default: (void) fprintf(stderr, "invalid data type\n"); } switch (field) { case USFIELD_TYPE: if (type == DATA_TYPE_UINT32) strval = us_type2str(val32); break; case USFIELD_NAME: if (type == DATA_TYPE_UINT64) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } break; case USFIELD_USED: case USFIELD_QUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_QUOTA && val64 == 0) { strval = "none"; } else { zfs_nicebytes(val64, valstr, sizeof (valstr)); strval = valstr; } } break; case USFIELD_OBJUSED: case USFIELD_OBJQUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_OBJQUOTA && val64 == 0) { strval = "none"; } else { zfs_nicenum(val64, valstr, sizeof (valstr)); strval = valstr; } } break; } if (!first) { if (scripted) (void) putchar('\t'); else (void) fputs(" ", stdout); } if (scripted) (void) fputs(strval, stdout); else if (field == USFIELD_TYPE || field == USFIELD_NAME) (void) printf("%-*s", (int)width[field], strval); else (void) printf("%*s", (int)width[field], strval); first = B_FALSE; cfield++; } (void) putchar('\n'); } static void print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, boolean_t rmnode, uu_avl_t *avl) { us_node_t *node; const char *col; int cfield = 0; int field; if (!scripted) { boolean_t first = B_TRUE; while ((field = fields[cfield]) != USFIELD_LAST) { col = gettext(us_field_hdr[field]); if (field == USFIELD_TYPE || field == USFIELD_NAME) { (void) printf(first ? "%-*s" : " %-*s", (int)width[field], col); } else { (void) printf(first ? 
"%*s" : " %*s", (int)width[field], col); } first = B_FALSE; cfield++; } (void) printf("\n"); } for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { print_us_node(scripted, parsable, fields, types, width, node); if (rmnode) nvlist_free(node->usn_nvl); } } static int zfs_do_userspace(int argc, char **argv) { zfs_handle_t *zhp; zfs_userquota_prop_t p; uu_avl_pool_t *avl_pool; uu_avl_t *avl_tree; uu_avl_walk_t *walk; char *delim; char deffields[] = "type,name,used,quota,objused,objquota"; char *ofield = NULL; char *tfield = NULL; int cfield = 0; int fields[256]; int i; boolean_t scripted = B_FALSE; boolean_t prtnum = B_FALSE; boolean_t parsable = B_FALSE; boolean_t sid2posix = B_FALSE; int ret = 0; int c; zfs_sort_column_t *sortcol = NULL; int types = USTYPE_PSX_USR | USTYPE_SMB_USR; us_cbdata_t cb; us_node_t *node; us_node_t *rmnode; uu_list_pool_t *listpool; uu_list_t *list; uu_avl_index_t idx = 0; uu_list_index_t idx2 = 0; if (argc < 2) usage(B_FALSE); if (strcmp(argv[0], "groupspace") == 0) { /* Toggle default group types */ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; } else if (strcmp(argv[0], "projectspace") == 0) { types = USTYPE_PROJ; prtnum = B_TRUE; } while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { switch (c) { case 'n': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'n'\n")); usage(B_FALSE); } prtnum = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'o': ofield = optarg; break; case 's': case 'S': if (zfs_add_sort_column(&sortcol, optarg, c == 's' ? B_FALSE : B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid field '%s'\n"), optarg); usage(B_FALSE); } break; case 't': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 't'\n")); usage(B_FALSE); } tfield = optarg; break; case 'i': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'i'\n")); usage(B_FALSE); } sid2posix = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* Use default output fields if not specified using -o */ if (ofield == NULL) ofield = deffields; do { if ((delim = strchr(ofield, ',')) != NULL) *delim = '\0'; if ((fields[cfield++] = us_field_index(ofield)) == -1) { (void) fprintf(stderr, gettext("invalid type '%s' " "for -o option\n"), ofield); return (-1); } if (delim != NULL) ofield = delim + 1; } while (delim != NULL); fields[cfield] = USFIELD_LAST; /* Override output types (-t option) */ if (tfield != NULL) { types = 0; do { boolean_t found = B_FALSE; if ((delim = strchr(tfield, ',')) != NULL) *delim = '\0'; for (i = 0; i < sizeof (us_type_bits) / sizeof (int); i++) { if (strcmp(tfield, us_type_names[i]) == 0) { found = B_TRUE; types |= us_type_bits[i]; break; } } if (!found) { (void) fprintf(stderr, gettext("invalid type " "'%s' for -t option\n"), tfield); return (-1); } if (delim != NULL) tfield = delim + 1; } while (delim != NULL); } if ((zhp = zfs_path_to_zhandle(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) return (1); if (zfs_get_underlying_type(zhp) != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("operation is only applicable " "to filesystems and 
their snapshots\n")); zfs_close(zhp); return (1); } if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) nomem(); if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) nomem(); /* Always add default sorting columns */ (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); cb.cb_sortcol = sortcol; cb.cb_numname = prtnum; cb.cb_nicenum = !parsable; cb.cb_avl_pool = avl_pool; cb.cb_avl = avl_tree; cb.cb_sid2posix = sid2posix; for (i = 0; i < USFIELD_LAST; i++) cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { if ((zfs_prop_is_user(p) && !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || (zfs_prop_is_group(p) && !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) || (zfs_prop_is_project(p) && types != USTYPE_PROJ)) continue; cb.cb_prop = p; if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) { zfs_close(zhp); return (ret); } } zfs_close(zhp); /* Sort the list */ if ((node = uu_avl_first(avl_tree)) == NULL) return (0); us_populated = B_TRUE; listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); list = uu_list_create(listpool, NULL, UU_DEFAULT); uu_list_node_init(node, &node->usn_listnode, listpool); while (node != NULL) { rmnode = node; node = uu_avl_next(avl_tree, node); uu_avl_remove(avl_tree, rmnode); if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) uu_list_insert(list, rmnode, idx2); } for (node = uu_list_first(list); node != NULL; node = uu_list_next(list, node)) { us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) uu_avl_insert(avl_tree, node, idx); } uu_list_destroy(list); uu_list_pool_destroy(listpool); /* Print and free node nvlist memory */ print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, cb.cb_avl); zfs_free_sort_columns(sortcol); /* Clean up the AVL tree */ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(cb.cb_avl, node); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(avl_tree); uu_avl_pool_destroy(avl_pool); return (ret); } /* * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] * [-t type[,...]] [filesystem|volume|snapshot] ... * * -H Scripted mode; elide headers and separate columns by tabs * -p Display values in parsable (literal) format. * -r Recurse over all children * -d Limit recursion by depth. * -o Control which fields to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * When given no arguments, list all filesystems in the system. * Otherwise, list the specified datasets, optionally recursing down them if * '-r' is specified. */ typedef struct list_cbdata { boolean_t cb_first; boolean_t cb_literal; boolean_t cb_scripted; zprop_list_t *cb_proplist; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. 
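 *
 * Illustrative example (not normative): with the default 'zfs list' field
 * set "name,used,available,referenced,mountpoint", the header line printed
 * here looks roughly like
 *
 *	NAME   USED  AVAIL  REFER  MOUNTPOINT
 *
 * with each column left- or right-justified according to
 * zfs_prop_align_right() and padded to its pl_width.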
*/ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZFS_MAXPROPLEN]; const char *header; int i; boolean_t first = B_TRUE; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { header = zfs_prop_column_name(pl->pl_prop); right_justify = zfs_prop_align_right(pl->pl_prop); } else { for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, header); else (void) printf("%-*s", (int)pl->pl_width, header); } (void) printf("\n"); } /* * Given a dataset and a list of fields, print out all the properties according * to the described layout. */ static void print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZFS_MAXPROPLEN]; nvlist_t *userprops = zfs_get_user_props(zhp); nvlist_t *propval; const char *propstr; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { if (cb->cb_scripted) (void) putchar('\t'); else (void) fputs(" ", stdout); } else { first = B_FALSE; } if (pl->pl_prop == ZFS_PROP_NAME) { (void) strlcpy(property, zfs_get_name(zhp), sizeof (property)); propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (pl->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (zfs_prop_userquota(pl->pl_user_prop)) { if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else if (zfs_prop_written(pl->pl_user_prop)) { if (zfs_prop_get_written(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) propstr = "-"; else propstr = fnvlist_lookup_string(propval, ZPROP_VALUE); right_justify = B_FALSE; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) fputs(propstr, stdout); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, propstr); else (void) printf("%-*s", (int)pl->pl_width, propstr); } (void) putchar('\n'); } /* * Generic callback function to list a dataset or snapshot. 
*/ static int list_callback(zfs_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; if (cbp->cb_first) { if (!cbp->cb_scripted) print_header(cbp); cbp->cb_first = B_FALSE; } print_dataset(zhp, cbp); return (0); } static int zfs_do_list(int argc, char **argv) { int c; char default_fields[] = "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; boolean_t types_specified = B_FALSE; char *fields = default_fields; list_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { switch (c) { case 'o': fields = optarg; break; case 'p': cb.cb_literal = B_TRUE; flags |= ZFS_ITER_LITERAL_PROPS; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 's': if (zfs_add_sort_column(&sortcol, optarg, B_FALSE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 'S': if (zfs_add_sort_column(&sortcol, optarg, B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 't': types = 0; types_specified = B_TRUE; flags &= ~ZFS_ITER_PROP_LISTSNAPS; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_subopts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all" }; static const int type_types[] = { ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; for (c = 0; c < ARRAY_SIZE(type_subopts); ++c) if (strcmp(tok, type_subopts[c]) == 0) { types |= type_types[c]; goto found3; } (void) fprintf(stderr, gettext("invalid type '%s'\n"), tok); usage(B_FALSE); found3:; } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* * If we are only going to list snapshot names and sort by name or * by createtxg, then we can use faster version. */ if (strcmp(fields, "name") == 0 && (zfs_sort_only_by_name(sortcol) || zfs_sort_only_by_createtxg(sortcol))) { flags |= ZFS_ITER_SIMPLE; } /* * If "-o space" and no types were specified, don't display snapshots. */ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) types &= ~ZFS_TYPE_SNAPSHOT; /* * Handle users who want to list all snapshots or bookmarks * of the current dataset (ex. 'zfs list -t snapshot '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } /* * If the user specifies '-o all', the zprop_get_list() doesn't * normally include the name of the dataset. For 'zfs list', we always * want this property to be first. 
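 *
 * Illustrative example ('tank' is a placeholder): a request such as
 *
 *	# zfs list -o all tank
 *
 * is still expected to print the NAME column first, since "all" by itself
 * would not put the name property at the head of the list.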
*/ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); if (ret == 0 && cb.cb_first && !cb.cb_scripted) (void) fprintf(stderr, gettext("no datasets available\n")); return (ret); } /* * zfs rename [-fu] * zfs rename [-f] -p * zfs rename [-u] -r * * Renames the given dataset to another of the same type. * * The '-p' flag creates all the non-existing ancestors of the target first. * The '-u' flag prevents file systems from being remounted during rename. */ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; renameflags_t flags = { 0 }; int c; int ret = 0; int types; boolean_t parents = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "pruf")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': flags.recursive = B_TRUE; break; case 'u': flags.nounmount = B_TRUE; break; case 'f': flags.forceunmount = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (flags.recursive && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.nounmount && parents) { (void) fprintf(stderr, gettext("-u and -p options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.recursive && strchr(argv[0], '@') == 0) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } if (flags.nounmount) types = ZFS_TYPE_FILESYSTEM; else if (parents) types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; else types = ZFS_TYPE_DATASET; if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. 
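 *
 * Illustrative example (dataset names are placeholders): with
 *
 *	# zfs rename -p tank/data tank/projects/archive/data
 *
 * zfs_create_ancestors() is expected to create tank/projects and
 * tank/projects/archive first, if they do not already exist, before the
 * rename itself is attempted.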
*/ if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) && zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); return (1); } ret = (zfs_rename(zhp, argv[1], flags) != 0); zfs_close(zhp); return (ret); } /* * zfs promote * * Promotes the given clone fs to be the parent */ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing clone filesystem" " argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); ret = (zfs_promote(zhp) != 0); zfs_close(zhp); return (ret); } static int zfs_do_redact(int argc, char **argv) { char *snap = NULL; char *bookname = NULL; char **rsnaps = NULL; int numrsnaps = 0; argv++; argc--; if (argc < 3) { (void) fprintf(stderr, gettext("too few arguments\n")); usage(B_FALSE); } snap = argv[0]; bookname = argv[1]; rsnaps = argv + 2; numrsnaps = argc - 2; nvlist_t *rsnapnv = fnvlist_alloc(); for (int i = 0; i < numrsnaps; i++) { fnvlist_add_boolean(rsnapnv, rsnaps[i]); } int err = lzc_redact(snap, bookname, rsnapnv); fnvlist_free(rsnapnv); switch (err) { case 0: break; case ENOENT: (void) fprintf(stderr, gettext("provided snapshot %s does not exist\n"), snap); break; case EEXIST: (void) fprintf(stderr, gettext("specified redaction bookmark " "(%s) provided already exists\n"), bookname); break; case ENAMETOOLONG: (void) fprintf(stderr, gettext("provided bookmark name cannot " "be used, final name would be too long\n")); break; case E2BIG: (void) fprintf(stderr, gettext("too many redaction snapshots " "specified\n")); break; case EINVAL: if (strchr(bookname, '#') != NULL) (void) fprintf(stderr, gettext( "redaction bookmark name must not contain '#'\n")); else (void) fprintf(stderr, gettext( "redaction snapshot must be descendent of " "snapshot being redacted\n")); break; case EALREADY: (void) fprintf(stderr, gettext("attempted to redact redacted " "dataset or with respect to redacted dataset\n")); break; case ENOTSUP: (void) fprintf(stderr, gettext("redaction bookmarks feature " "not enabled\n")); break; case EXDEV: (void) fprintf(stderr, gettext("potentially invalid redaction " "snapshot; full dataset names required\n")); break; default: (void) fprintf(stderr, gettext("internal error: %s\n"), strerror(errno)); } return (err); } /* * zfs rollback [-rRf] * * -r Delete any intervening snapshots before doing rollback * -R Delete any snapshots and their clones * -f ignored for backwards compatibility * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, * the command will complain unless the '-r' flag is given. 
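 *
 * Minimal sketch of the "more recent" test applied by rollback_check()
 * below (variable names are illustrative): a snapshot or bookmark blocks a
 * plain rollback when its creation transaction group is newer than that of
 * the rollback target.
 *
 *	uint64_t target_txg = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 *	if (zfs_prop_get_int(other, ZFS_PROP_CREATETXG) > target_txg)
 *		... 'other' is younger, so -r (or -R for clones) is required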
*/ typedef struct rollback_cbdata { uint64_t cb_create; uint8_t cb_younger_ds_printed; boolean_t cb_first; int cb_doclones; char *cb_target; int cb_error; boolean_t cb_recurse; } rollback_cbdata_t; static int rollback_check_dependent(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; if (cbp->cb_first && cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot rollback to " "'%s': clones of previous snapshots exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-R' to " "force deletion of the following clones and " "dependents:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); zfs_close(zhp); return (0); } /* * Report some snapshots/bookmarks more recent than the one specified. * Used when '-r' is not specified. We reuse this same callback for the * snapshot dependents - if 'cb_dependent' is set, then this is a * dependent and we should report it without checking the transaction group. */ static int rollback_check(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; /* * Max number of younger snapshots and/or bookmarks to display before * we stop the iteration. */ const uint8_t max_younger = 32; if (cbp->cb_doclones) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { if (cbp->cb_first && !cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot " "rollback to '%s': more recent snapshots " "or bookmarks exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-r' to " "force deletion of the following " "snapshots and bookmarks:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } if (cbp->cb_recurse) { if (zfs_iter_dependents(zhp, B_TRUE, rollback_check_dependent, cbp) != 0) { zfs_close(zhp); return (-1); } } else { (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); cbp->cb_younger_ds_printed++; } } zfs_close(zhp); if (cbp->cb_younger_ds_printed == max_younger) { /* * This non-recursive rollback is going to fail due to the * presence of snapshots and/or bookmarks that are younger than * the rollback target. 
* We printed some of the offending objects, now we stop * zfs_iter_snapshot/bookmark iteration so we can fail fast and * avoid iterating over the rest of the younger objects */ (void) fprintf(stderr, gettext("Output limited to %d " "snapshots/bookmarks\n"), max_younger); return (-1); } return (0); } static int zfs_do_rollback(int argc, char **argv) { int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; zfs_handle_t *zhp, *snap; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *delim; uint64_t min_txg = 0; /* check options */ while ((c = getopt(argc, argv, "rRf")) != -1) { switch (c) { case 'r': cb.cb_recurse = 1; break; case 'R': cb.cb_recurse = 1; cb.cb_doclones = 1; break; case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* open the snapshot */ if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) return (1); /* open the parent dataset */ (void) strlcpy(parentname, argv[0], sizeof (parentname)); verify((delim = strrchr(parentname, '@')) != NULL); *delim = '\0'; if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { zfs_close(snap); return (1); } /* * Check for more recent snapshots and/or clones based on the presence * of '-r' and '-R'. */ cb.cb_target = argv[0]; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); cb.cb_first = B_TRUE; cb.cb_error = 0; if (cb.cb_create > 0) min_txg = cb.cb_create; if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb, min_txg, 0)) != 0) goto out; if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) goto out; /* * Rollback parent to the given snapshot. */ ret = zfs_rollback(zhp, snap, force); out: zfs_close(snap); zfs_close(zhp); if (ret == 0) return (0); else return (1); } /* * zfs set property=value ... { fs | snap | vol } ... * * Sets the given properties for all datasets specified on the command line. 
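 *
 * Minimal sketch of what one dataset's update reduces to (property name and
 * value are placeholders; error handling trimmed), using the same calls as
 * set_callback() below:
 *
 *	nvlist_t *props;
 *	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 *		nomem();
 *	if (nvlist_add_string(props, "compression", "on") != 0)
 *		nomem();
 *	if (zfs_prop_set_list(zhp, props) != 0)
 *		... report the error via libzfs_errno(g_zfs)
 *	nvlist_free(props);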
*/ static int set_callback(zfs_handle_t *zhp, void *data) { nvlist_t *props = data; if (zfs_prop_set_list(zhp, props) != 0) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to remount filesystem\n")); break; case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); break; } return (1); } return (0); } static int zfs_do_set(int argc, char **argv) { nvlist_t *props = NULL; int ds_start = -1; /* argv idx of first dataset arg */ int ret = 0; int i; /* check for options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } if (argc < 3) { if (strchr(argv[1], '=') == NULL) { (void) fprintf(stderr, gettext("missing property=value " "argument(s)\n")); } else { (void) fprintf(stderr, gettext("missing dataset " "name(s)\n")); } usage(B_FALSE); } /* validate argument order: prop=val args followed by dataset args */ for (i = 1; i < argc; i++) { if (strchr(argv[i], '=') != NULL) { if (ds_start > 0) { /* out-of-order prop=val argument */ (void) fprintf(stderr, gettext("invalid " "argument order\n")); usage(B_FALSE); } } else if (ds_start < 0) { ds_start = i; } } if (ds_start < 0) { (void) fprintf(stderr, gettext("missing dataset name(s)\n")); usage(B_FALSE); } /* Populate a list of property settings */ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 1; i < ds_start; i++) { if (!parseprop(props, argv[i])) { ret = -1; goto error; } } ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); error: nvlist_free(props); return (ret); } typedef struct snap_cbdata { nvlist_t *sd_nvl; boolean_t sd_recursive; const char *sd_snapname; } snap_cbdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snap_cbdata_t *sd = arg; char *name; int rv = 0; int error; if (sd->sd_recursive && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { zfs_close(zhp); return (0); } error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); if (error == -1) nomem(); fnvlist_add_boolean(sd->sd_nvl, name); free(name); if (sd->sd_recursive) rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); zfs_close(zhp); return (rv); } /* * zfs snapshot [-r] [-o prop=value] ... * * Creates a snapshot with the given name. While functionally equivalent to * 'zfs create', it is a separate command to differentiate intent. 
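 *
 * Illustrative example (dataset names are placeholders): for
 *
 *	# zfs snapshot -r tank/home@backup
 *
 * zfs_snapshot_cb() walks the hierarchy with zfs_iter_filesystems() and
 * collects one entry per snapshot to create, e.g.
 *
 *	tank/home@backup
 *	tank/home/alice@backup
 *	tank/home/bob@backup
 *
 * all of which are then passed to a single zfs_snapshot_nvl() call.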
*/ static int zfs_do_snapshot(int argc, char **argv) { int ret = 0; int c; nvlist_t *props; snap_cbdata_t sd = { 0 }; boolean_t multiple_snaps = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(sd.sd_nvl); nvlist_free(props); return (1); } break; case 'r': sd.sd_recursive = B_TRUE; multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } if (argc > 1) multiple_snaps = B_TRUE; for (; argc > 0; argc--, argv++) { char *atp; zfs_handle_t *zhp; atp = strchr(argv[0], '@'); if (atp == NULL) goto usage; *atp = '\0'; sd.sd_snapname = atp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) goto usage; if (zfs_snapshot_cb(zhp, &sd) != 0) goto usage; } ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); nvlist_free(props); if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Array of prefixes to exclude – * a linear search, even if executed for each dataset, * is plenty good enough. */ typedef struct zfs_send_exclude_arg { size_t count; const char **list; } zfs_send_exclude_arg_t; static boolean_t zfs_do_send_exclude(zfs_handle_t *zhp, void *context) { zfs_send_exclude_arg_t *excludes = context; const char *name = zfs_get_name(zhp); for (size_t i = 0; i < excludes->count; ++i) { size_t len = strlen(excludes->list[i]); if (strncmp(name, excludes->list[i], len) == 0 && memchr("/@", name[len], sizeof ("/@"))) return (B_FALSE); } return (B_TRUE); } /* * Send a backup stream to stdout. 
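 *
 * Illustrative usage (names are placeholders); the stream always goes to
 * standard output, so it is normally redirected or piped:
 *
 *	# zfs send tank/fs@monday > /backup/fs-monday.stream
 *	# zfs send -i tank/fs@monday tank/fs@tuesday | zfs receive backup/fs
 *
 * Writing the stream to a terminal is rejected below unless -n (dry run)
 * is given.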
*/ static int zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; char *redactbook = NULL; zfs_send_exclude_arg_t excludes = { 0 }; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, {"skip-missing", no_argument, NULL, 's'}, {"redact", required_argument, NULL, 'd'}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, {"verbose", no_argument, NULL, 'v'}, {"dryrun", no_argument, NULL, 'n'}, {"large-block", no_argument, NULL, 'L'}, {"embed", no_argument, NULL, 'e'}, {"resume", required_argument, NULL, 't'}, {"compressed", no_argument, NULL, 'c'}, {"raw", no_argument, NULL, 'w'}, {"backup", no_argument, NULL, 'b'}, {"holds", no_argument, NULL, 'h'}, {"saved", no_argument, NULL, 'S'}, {"exclude", required_argument, NULL, 'X'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":i:I:RsDpvnPLeht:cwbd:SX:", long_options, NULL)) != -1) { switch (c) { case 'X': for (char *ds; (ds = strsep(&optarg, ",")) != NULL; ) { if (!zfs_name_valid(ds, ZFS_TYPE_DATASET) || strchr(ds, '/') == NULL) { (void) fprintf(stderr, gettext("-X %s: " "not a valid non-root dataset name" ".\n"), ds); usage(B_FALSE); } excludes.list = safe_realloc(excludes.list, sizeof (char *) * (excludes.count + 1)); excludes.list[excludes.count++] = ds; } break; case 'i': if (fromname) usage(B_FALSE); fromname = optarg; break; case 'I': if (fromname) usage(B_FALSE); fromname = optarg; flags.doall = B_TRUE; break; case 'R': flags.replicate = B_TRUE; break; case 's': flags.skipmissing = B_TRUE; break; case 'd': redactbook = optarg; break; case 'p': flags.props = B_TRUE; break; case 'b': flags.backup = B_TRUE; break; case 'h': flags.holds = B_TRUE; break; case 'P': flags.parsable = B_TRUE; break; case 'v': flags.verbosity++; flags.progress = B_TRUE; break; case 'D': (void) fprintf(stderr, gettext("WARNING: deduplicated send is no " "longer supported. A regular,\n" "non-deduplicated stream will be generated.\n\n")); break; case 'n': flags.dryrun = B_TRUE; break; case 'L': flags.largeblock = B_TRUE; break; case 'e': flags.embed_data = B_TRUE; break; case 't': resume_token = optarg; break; case 'c': flags.compress = B_TRUE; break; case 'w': flags.raw = B_TRUE; flags.compress = B_TRUE; flags.embed_data = B_TRUE; flags.largeblock = B_TRUE; break; case 'S': flags.saved = B_TRUE; break; case ':': /* * If a parameter was not passed, optopt contains the * value that would normally lead us into the * appropriate case statement. If it's > 256, then this * must be a longopt and we should look at argv to get * the string. Otherwise it's just the character, so we * should use it directly. */ if (optopt <= UINT8_MAX) { (void) fprintf(stderr, gettext("missing argument for '%c' " "option\n"), optopt); } else { (void) fprintf(stderr, gettext("missing argument for '%s' " "option\n"), argv[optind - 1]); } usage(B_FALSE); break; case '?': default: /* * If an invalid flag was passed, optopt contains the * character if it was a short flag, or 0 if it was a * longopt. 
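 *
 * Illustrative example ('--frobnicate' is a made-up option): for
 * 'zfs send --frobnicate', getopt_long() leaves optopt at 0, so the
 * message below prints the offending string from argv[optind - 1] rather
 * than a single character.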
*/ if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } if (flags.parsable && flags.verbosity == 0) flags.verbosity = 1; if (excludes.count > 0 && !flags.replicate) { (void) fprintf(stderr, gettext("Cannot specify " "dataset exclusion (-X) on a non-recursive " "send.\n")); return (1); } argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || flags.backup || flags.holds || flags.saved || redactbook != NULL) { (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } if (argc > 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } if (flags.saved) { if (fromname != NULL || flags.replicate || flags.props || flags.doall || flags.backup || flags.holds || flags.largeblock || flags.embed_data || flags.compress || flags.raw || redactbook != NULL) { (void) fprintf(stderr, gettext("incompatible flags " "combined with saved send flag\n")); usage(B_FALSE); } if (strchr(argv[0], '@') != NULL) { (void) fprintf(stderr, gettext("saved send must " "specify the dataset with partially-received " "state\n")); usage(B_FALSE); } } if (flags.raw && redactbook != NULL) { (void) fprintf(stderr, gettext("Error: raw sends may not be redacted.\n")); return (1); } if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); return (1); } if (flags.saved) { zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_saved(zhp, &flags, STDOUT_FILENO, resume_token); zfs_close(zhp); return (err != 0); } else if (resume_token != NULL) { return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, resume_token)); } if (flags.skipmissing && !flags.replicate) { (void) fprintf(stderr, gettext("skip-missing flag can only be used in " "conjunction with replicate\n")); usage(B_FALSE); } /* * For everything except -R and -I, use the new, cleaner code path. */ if (!(flags.replicate || flags.doall)) { char frombuf[ZFS_MAX_DATASET_NAME_LEN]; if (fromname != NULL && (strchr(fromname, '#') == NULL && strchr(fromname, '@') == NULL)) { /* * Neither bookmark or snapshot was specified. Print a * warning, and assume snapshot. */ (void) fprintf(stderr, "Warning: incremental source " "didn't specify type, assuming snapshot. Use '@' " "or '#' prefix to avoid ambiguity.\n"); (void) snprintf(frombuf, sizeof (frombuf), "@%s", fromname); fromname = frombuf; } if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. 
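 *
 * Illustrative example (names are placeholders): with a target snapshot of
 * 'tank/fs@tuesday', an incremental source given as '-i @monday' is
 * expanded to 'tank/fs@monday', and '-i #mark' to 'tank/fs#mark'.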
*/ char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); fromname = frombuf; } zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, redactbook); zfs_close(zhp); return (err != 0); } if (fromname != NULL && strchr(fromname, '#')) { (void) fprintf(stderr, gettext("Error: multiple snapshots cannot be " "sent from a bookmark.\n")); return (1); } if (redactbook != NULL) { (void) fprintf(stderr, gettext("Error: multiple snapshots " "cannot be sent redacted.\n")); return (1); } if ((cp = strchr(argv[0], '@')) == NULL) { (void) fprintf(stderr, gettext("Error: " "Unsupported flag with filesystem or bookmark.\n")); return (1); } *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); /* * If they specified the full path to the snapshot, chop off * everything except the short name of the snapshot, but special * case if they specify the origin. */ if (fromname && (cp = strchr(fromname, '@')) != NULL) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE); if (strcmp(origin, fromname) == 0) { fromname = NULL; flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { (void) fprintf(stderr, gettext("incremental source must be " "in same filesystem\n")); usage(B_FALSE); } fromname = cp + 1; if (strchr(fromname, '@') || strchr(fromname, '/')) { (void) fprintf(stderr, gettext("invalid incremental source\n")); usage(B_FALSE); } } } if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, excludes.count > 0 ? zfs_do_send_exclude : NULL, &excludes, flags.verbosity >= 3 ? &dbgnv : NULL); if (flags.verbosity >= 3 && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. Make it print to stderr * instead. */ (void) dup2(STDERR_FILENO, STDOUT_FILENO); dump_nvlist(dbgnv, 0); nvlist_free(dbgnv); } zfs_close(zhp); free(excludes.list); return (err != 0); } /* * Restore a backup stream from stdin. 
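 *
 * Illustrative usage (names are placeholders); the stream is always read
 * from standard input, so it is normally piped or redirected:
 *
 *	# zfs send tank/fs@monday | zfs receive -u backup/fs
 *	# zfs receive -s backup/fs < /backup/fs-monday.stream
 *
 * Reading the stream from a terminal is rejected below.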
*/ static int zfs_do_receive(int argc, char **argv) { int c, err = 0; recvflags_t flags = { 0 }; boolean_t abort_resumable = B_FALSE; nvlist_t *props; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":o:x:dehMnuvFsAc")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'x': if (!parsepropname(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'd': if (flags.istail) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.isprefix = B_TRUE; break; case 'e': if (flags.isprefix) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.istail = B_TRUE; break; case 'h': flags.skipholds = B_TRUE; break; case 'M': flags.forceunmount = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'u': flags.nomount = B_TRUE; break; case 'v': flags.verbose = B_TRUE; break; case 's': flags.resumable = B_TRUE; break; case 'F': flags.force = B_TRUE; break; case 'A': abort_resumable = B_TRUE; break; case 'c': flags.heal = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */ if (flags.istail) flags.isprefix = B_TRUE; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (abort_resumable) { if (flags.isprefix || flags.istail || flags.dryrun || flags.resumable || flags.nomount) { (void) fprintf(stderr, gettext("invalid option\n")); usage(B_FALSE); } char namebuf[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(namebuf, sizeof (namebuf), "%s/%%recv", argv[0]); if (zfs_dataset_exists(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_handle_t *zhp = zfs_open(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(props); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } else { zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { (void) fprintf(stderr, gettext("'%s' does not have any " "resumable receive state to abort\n"), argv[0]); nvlist_free(props); zfs_close(zhp); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } nvlist_free(props); return (err != 0); } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " "from a terminal.\n" "You must redirect standard input.\n")); nvlist_free(props); return (1); } err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); nvlist_free(props); return (err != 0); } /* * allow/unallow stuff */ /* copied from zfs/sys/dsl_deleg.h */ #define ZFS_DELEG_PERM_CREATE "create" #define ZFS_DELEG_PERM_DESTROY "destroy" #define ZFS_DELEG_PERM_SNAPSHOT "snapshot" #define ZFS_DELEG_PERM_ROLLBACK "rollback" #define ZFS_DELEG_PERM_CLONE "clone" #define ZFS_DELEG_PERM_PROMOTE "promote" #define ZFS_DELEG_PERM_RENAME "rename" #define 
ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ #define ZFS_DELEG_PERM_USERQUOTA "userquota" #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" #define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota" #define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota" #define ZFS_DELEG_PERM_USEROBJUSED "userobjused" #define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused" #define ZFS_DELEG_PERM_HOLD "hold" #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" #define ZFS_DELEG_PERM_PROJECTUSED "projectused" #define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota" #define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused" #define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota" #define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY }, { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY }, { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, { ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA }, { ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED }, { ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA }, { ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED }, { ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED }, { ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA }, { ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED }, { ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA }, { NULL, ZFS_DELEG_NOTE_NONE } }; /* permission structure */ typedef struct deleg_perm { zfs_deleg_who_type_t dp_who_type; const char *dp_name; boolean_t dp_local; boolean_t dp_descend; } deleg_perm_t; /* */ typedef struct deleg_perm_node { deleg_perm_t dpn_perm; uu_avl_node_t dpn_avl_node; } deleg_perm_node_t; typedef struct fs_perm fs_perm_t; /* permissions set */ typedef struct who_perm { zfs_deleg_who_type_t who_type; const char *who_name; /* id */ char who_ug_name[256]; /* user/group name */ fs_perm_t *who_fsperm; /* uplink */ uu_avl_t *who_deleg_perm_avl; /* permissions */ } who_perm_t; /* */ typedef struct who_perm_node { who_perm_t 
who_perm; uu_avl_node_t who_avl_node; } who_perm_node_t; typedef struct fs_perm_set fs_perm_set_t; /* fs permissions */ struct fs_perm { const char *fsp_name; uu_avl_t *fsp_sc_avl; /* sets,create */ uu_avl_t *fsp_uge_avl; /* user,group,everyone */ fs_perm_set_t *fsp_set; /* uplink */ }; /* */ typedef struct fs_perm_node { fs_perm_t fspn_fsperm; uu_avl_t *fspn_avl; uu_list_node_t fspn_list_node; } fs_perm_node_t; /* top level structure */ struct fs_perm_set { uu_list_pool_t *fsps_list_pool; uu_list_t *fsps_list; /* list of fs_perms */ uu_avl_pool_t *fsps_named_set_avl_pool; uu_avl_pool_t *fsps_who_perm_avl_pool; uu_avl_pool_t *fsps_deleg_perm_avl_pool; }; static inline const char * deleg_perm_type(zfs_deleg_note_t note) { /* subcommands */ switch (note) { /* SUBCOMMANDS */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: case ZFS_DELEG_NOTE_GROUPUSED: case ZFS_DELEG_NOTE_USERPROP: case ZFS_DELEG_NOTE_USERQUOTA: case ZFS_DELEG_NOTE_USERUSED: case ZFS_DELEG_NOTE_USEROBJQUOTA: case ZFS_DELEG_NOTE_USEROBJUSED: case ZFS_DELEG_NOTE_GROUPOBJQUOTA: case ZFS_DELEG_NOTE_GROUPOBJUSED: case ZFS_DELEG_NOTE_PROJECTUSED: case ZFS_DELEG_NOTE_PROJECTQUOTA: case ZFS_DELEG_NOTE_PROJECTOBJUSED: case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: /* other */ return (gettext("other")); default: return (gettext("subcommand")); } } static int who_type2weight(zfs_deleg_who_type_t who_type) { int res; switch (who_type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: res = 0; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: res = 1; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: res = 2; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: res = 3; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: res = 4; break; default: res = -1; } return (res); } static int who_perm_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const who_perm_node_t *l = larg; const who_perm_node_t *r = rarg; zfs_deleg_who_type_t ltype = l->who_perm.who_type; zfs_deleg_who_type_t rtype = r->who_perm.who_type; int lweight = who_type2weight(ltype); int rweight = who_type2weight(rtype); int res = lweight - rweight; if (res == 0) res = strncmp(l->who_perm.who_name, r->who_perm.who_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static int deleg_perm_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const deleg_perm_node_t *l = larg; const deleg_perm_node_t *r = rarg; int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static inline void fs_perm_set_init(fs_perm_set_t *fspset) { memset(fspset, 0, sizeof (fs_perm_set_t)); if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( deleg_perm_node_t, dpn_avl_node), 
deleg_perm_compare, UU_DEFAULT)) == NULL) nomem(); } static inline void fs_perm_fini(fs_perm_t *); static inline void who_perm_fini(who_perm_t *); static inline void fs_perm_set_fini(fs_perm_set_t *fspset) { fs_perm_node_t *node = uu_list_first(fspset->fsps_list); while (node != NULL) { fs_perm_node_t *next_node = uu_list_next(fspset->fsps_list, node); fs_perm_t *fsperm = &node->fspn_fsperm; fs_perm_fini(fsperm); uu_list_remove(fspset->fsps_list, node); free(node); node = next_node; } uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); } static inline void deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, const char *name) { deleg_perm->dp_who_type = type; deleg_perm->dp_name = name; } static inline void who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, zfs_deleg_who_type_t type, const char *name) { uu_avl_pool_t *pool; pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; memset(who_perm, 0, sizeof (who_perm_t)); if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) nomem(); who_perm->who_type = type; who_perm->who_name = name; who_perm->who_fsperm = fsperm; } static inline void who_perm_fini(who_perm_t *who_perm) { deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); while (node != NULL) { deleg_perm_node_t *next_node = uu_avl_next(who_perm->who_deleg_perm_avl, node); uu_avl_remove(who_perm->who_deleg_perm_avl, node); free(node); node = next_node; } uu_avl_destroy(who_perm->who_deleg_perm_avl); } static inline void fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) { uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; memset(fsperm, 0, sizeof (fs_perm_t)); if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) == NULL) nomem(); fsperm->fsp_set = fspset; fsperm->fsp_name = fsname; } static inline void fs_perm_fini(fs_perm_t *fsperm) { who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_sc_avl, node); free(node); node = next_node; } node = uu_avl_first(fsperm->fsp_uge_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_uge_avl, node); free(node); node = next_node; } uu_avl_destroy(fsperm->fsp_sc_avl); uu_avl_destroy(fsperm->fsp_uge_avl); } static void set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, zfs_deleg_who_type_t who_type, const char *name, char locality) { uu_avl_index_t idx = 0; deleg_perm_node_t *found_node = NULL; deleg_perm_t *deleg_perm = &node->dpn_perm; deleg_perm_init(deleg_perm, who_type, name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) uu_avl_insert(avl, node, idx); else { node = found_node; deleg_perm = &node->dpn_perm; } switch (locality) { case ZFS_DELEG_LOCAL: deleg_perm->dp_local = B_TRUE; break; case ZFS_DELEG_DESCENDENT: deleg_perm->dp_descend = B_TRUE; break; case ZFS_DELEG_NA: break; default: assert(B_FALSE); /* invalid locality */ } } static inline int parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) { nvpair_t *nvp = NULL; 
fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; uu_avl_t *avl = who_perm->who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_perm->who_type; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *name = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; deleg_perm_node_t *node = safe_malloc(sizeof (deleg_perm_node_t)); VERIFY(type == DATA_TYPE_BOOLEAN); uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); set_deleg_perm_node(avl, node, who_type, name, locality); } return (0); } static inline int parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = fsperm->fsp_set; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *name = nvpair_name(nvp); uu_avl_t *avl = NULL; uu_avl_pool_t *avl_pool = NULL; zfs_deleg_who_type_t perm_type = name[0]; char perm_locality = name[1]; const char *perm_name = name + 3; who_perm_t *who_perm = NULL; assert('$' == name[2]); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); switch (perm_type) { case ZFS_DELEG_CREATE: case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_NAMED_SET: case ZFS_DELEG_NAMED_SET_SETS: avl_pool = fspset->fsps_named_set_avl_pool; avl = fsperm->fsp_sc_avl; break; case ZFS_DELEG_USER: case ZFS_DELEG_USER_SETS: case ZFS_DELEG_GROUP: case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_EVERYONE: case ZFS_DELEG_EVERYONE_SETS: avl_pool = fspset->fsps_who_perm_avl_pool; avl = fsperm->fsp_uge_avl; break; default: assert(!"unhandled zfs_deleg_who_type_t"); } who_perm_node_t *found_node = NULL; who_perm_node_t *node = safe_malloc( sizeof (who_perm_node_t)); who_perm = &node->who_perm; uu_avl_index_t idx = 0; uu_avl_node_init(node, &node->who_avl_node, avl_pool); who_perm_init(who_perm, fsperm, perm_type, perm_name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) { if (avl == fsperm->fsp_uge_avl) { uid_t rid = 0; struct passwd *p = NULL; struct group *g = NULL; const char *nice_name = NULL; switch (perm_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: rid = atoi(perm_name); p = getpwuid(rid); if (p) nice_name = p->pw_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: rid = atoi(perm_name); g = getgrgid(rid); if (g) nice_name = g->gr_name; break; default: break; } if (nice_name != NULL) { (void) strlcpy( node->who_perm.who_ug_name, nice_name, 256); } else { /* User or group unknown */ (void) snprintf( node->who_perm.who_ug_name, sizeof (node->who_perm.who_ug_name), "(unknown: %d)", rid); } } uu_avl_insert(avl, node, idx); } else { node = found_node; who_perm = &node->who_perm; } assert(who_perm != NULL); (void) parse_who_perm(who_perm, nvl2, perm_locality); } return (0); } static inline int parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) { nvpair_t *nvp = NULL; uu_avl_index_t idx = 0; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *fsname = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); fs_perm_t *fsperm = NULL; fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); fsperm = &node->fspn_fsperm; VERIFY(DATA_TYPE_NVLIST == type); uu_list_node_init(node, &node->fspn_list_node, fspset->fsps_list_pool); idx = uu_list_numnodes(fspset->fsps_list); fs_perm_init(fsperm, fspset, fsname); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); (void) parse_fs_perm(fsperm, nvl2); uu_list_insert(fspset->fsps_list, node, idx); } return (0); } static inline const char * deleg_perm_comment(zfs_deleg_note_t note) { const 
char *str = ""; /* subcommands */ switch (note) { /* SUBCOMMANDS */ case ZFS_DELEG_NOTE_ALLOW: str = gettext("Must also have the permission that is being" "\n\t\t\t\tallowed"); break; case ZFS_DELEG_NOTE_CLONE: str = gettext("Must also have the 'create' ability and 'mount'" "\n\t\t\t\tability in the origin file system"); break; case ZFS_DELEG_NOTE_CREATE: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DESTROY: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DIFF: str = gettext("Allows lookup of paths within a dataset;" "\n\t\t\t\tgiven an object number. Ordinary users need this" "\n\t\t\t\tin order to use zfs diff"); break; case ZFS_DELEG_NOTE_HOLD: str = gettext("Allows adding a user hold to a snapshot"); break; case ZFS_DELEG_NOTE_MOUNT: str = gettext("Allows mount/umount of ZFS datasets"); break; case ZFS_DELEG_NOTE_PROMOTE: str = gettext("Must also have the 'mount'\n\t\t\t\tand" " 'promote' ability in the origin file system"); break; case ZFS_DELEG_NOTE_RECEIVE: str = gettext("Must also have the 'mount' and 'create'" " ability"); break; case ZFS_DELEG_NOTE_RELEASE: str = gettext("Allows releasing a user hold which\n\t\t\t\t" "might destroy the snapshot"); break; case ZFS_DELEG_NOTE_RENAME: str = gettext("Must also have the 'mount' and 'create'" "\n\t\t\t\tability in the new parent"); break; case ZFS_DELEG_NOTE_ROLLBACK: str = gettext(""); break; case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); break; case ZFS_DELEG_NOTE_SNAPSHOT: str = gettext(""); break; case ZFS_DELEG_NOTE_LOAD_KEY: str = gettext("Allows loading or unloading an encryption key"); break; case ZFS_DELEG_NOTE_CHANGE_KEY: str = gettext("Allows changing or adding an encryption key"); break; /* * case ZFS_DELEG_NOTE_VSCAN: * str = gettext(""); * break; */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: str = gettext("Allows accessing any groupquota@... property"); break; case ZFS_DELEG_NOTE_GROUPUSED: str = gettext("Allows reading any groupused@... property"); break; case ZFS_DELEG_NOTE_USERPROP: str = gettext("Allows changing any user property"); break; case ZFS_DELEG_NOTE_USERQUOTA: str = gettext("Allows accessing any userquota@... property"); break; case ZFS_DELEG_NOTE_USERUSED: str = gettext("Allows reading any userused@... property"); break; case ZFS_DELEG_NOTE_USEROBJQUOTA: str = gettext("Allows accessing any userobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "groupobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJUSED: str = gettext("Allows reading any groupobjused@... property"); break; case ZFS_DELEG_NOTE_USEROBJUSED: str = gettext("Allows reading any userobjused@... property"); break; case ZFS_DELEG_NOTE_PROJECTQUOTA: str = gettext("Allows accessing any projectquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTUSED: str = gettext("Allows reading any projectused@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJUSED: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjused@... 
property"); break; /* other */ default: str = ""; } return (str); } struct allow_opts { boolean_t local; boolean_t descend; boolean_t user; boolean_t group; boolean_t everyone; boolean_t create; boolean_t set; boolean_t recursive; /* unallow only */ boolean_t prt_usage; boolean_t prt_perms; char *who; char *perms; const char *dataset; }; static inline int prop_cmp(const void *a, const void *b) { const char *str1 = *(const char **)a; const char *str2 = *(const char **)b; return (strcmp(str1, str2)); } static void allow_usage(boolean_t un, boolean_t requested, const char *msg) { const char *opt_desc[] = { "-h", gettext("show this help message and exit"), "-l", gettext("set permission locally"), "-d", gettext("set permission for descents"), "-u", gettext("set permission for user"), "-g", gettext("set permission for group"), "-e", gettext("set permission for everyone"), "-c", gettext("set create time permission"), "-s", gettext("define permission set"), /* unallow only */ "-r", gettext("remove permissions recursively"), }; size_t unallow_size = sizeof (opt_desc) / sizeof (char *); size_t allow_size = unallow_size - 2; const char *props[ZFS_NUM_PROPS]; int i; size_t count = 0; FILE *fp = requested ? stdout : stderr; zprop_desc_t *pdtbl = zfs_prop_get_table(); const char *fmt = gettext("%-16s %-14s\t%s\n"); (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : HELP_ALLOW)); (void) fprintf(fp, gettext("Options:\n")); for (i = 0; i < (un ? unallow_size : allow_size); i += 2) { const char *opt = opt_desc[i]; const char *optdsc = opt_desc[i + 1]; (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); } (void) fprintf(fp, gettext("\nThe following permissions are " "supported:\n\n")); (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), gettext("NOTES")); for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; const char *perm_type = deleg_perm_type(perm_note); const char *perm_comment = deleg_perm_comment(perm_note); (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); } for (i = 0; i < ZFS_NUM_PROPS; i++) { zprop_desc_t *pd = &pdtbl[i]; if (pd->pd_visible != B_TRUE) continue; if (pd->pd_attr == PROP_READONLY) continue; props[count++] = pd->pd_name; } props[count] = NULL; qsort(props, count, sizeof (char *), prop_cmp); for (i = 0; i < count; i++) (void) fprintf(fp, fmt, props[i], gettext("property"), ""); if (msg != NULL) (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); exit(requested ? 0 : 2); } static inline const char * munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, char **permsp) { if (un && argc == expected_argc - 1) *permsp = NULL; else if (argc == expected_argc) *permsp = argv[argc - 2]; else allow_usage(un, B_FALSE, gettext("wrong number of parameters\n")); return (argv[argc - 1]); } static void parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) { int uge_sum = opts->user + opts->group + opts->everyone; int csuge_sum = opts->create + opts->set + uge_sum; int ldcsuge_sum = csuge_sum + opts->local + opts->descend; int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; if (uge_sum > 1) allow_usage(un, B_FALSE, gettext("-u, -g, and -e are mutually exclusive\n")); if (opts->prt_usage) { if (argc == 0 && all_sum == 0) allow_usage(un, B_TRUE, NULL); else usage(B_FALSE); } if (opts->set) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -s\n")); opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); if (argv[0][0] != '@') allow_usage(un, B_FALSE, gettext("invalid set name: missing '@' prefix\n")); opts->who = argv[0]; } else if (opts->create) { if (ldcsuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -c\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (opts->everyone) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -e\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") == 0) { opts->everyone = B_TRUE; argc--; argv++; opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (argc == 1 && !un) { opts->prt_perms = B_TRUE; opts->dataset = argv[argc-1]; } else { opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); opts->who = argv[0]; } if (!opts->local && !opts->descend) { opts->local = B_TRUE; opts->descend = B_TRUE; } } static void store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, const char *who, char *perms, nvlist_t *top_nvl) { int i; char ld[2] = { '\0', '\0' }; char who_buf[MAXNAMELEN + 32]; char base_type = '\0'; char set_type = '\0'; nvlist_t *base_nvl = NULL; nvlist_t *set_nvl = NULL; nvlist_t *nvl; if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); switch (type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: set_type = ZFS_DELEG_NAMED_SET_SETS; base_type = ZFS_DELEG_NAMED_SET; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: set_type = ZFS_DELEG_CREATE_SETS; base_type = ZFS_DELEG_CREATE; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: set_type = ZFS_DELEG_USER_SETS; base_type = ZFS_DELEG_USER; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: set_type = ZFS_DELEG_GROUP_SETS; base_type = ZFS_DELEG_GROUP; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: set_type = ZFS_DELEG_EVERYONE_SETS; base_type = ZFS_DELEG_EVERYONE; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; default: assert(set_type != '\0' && base_type != '\0'); } if (perms != NULL) { char *curr = perms; char *end = curr + strlen(perms); while (curr < end) { char *delim = strchr(curr, ','); if (delim == NULL) delim = end; else *delim = '\0'; if (curr[0] == '@') nvl = set_nvl; else nvl = base_nvl; (void) nvlist_add_boolean(nvl, curr); if (delim != end) *delim = ','; curr = delim + 1; } for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (!nvlist_empty(base_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, base_nvl); } if (!nvlist_empty(set_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, 
locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, set_nvl); } } } else { for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); } } } static int construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) { if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) nomem(); if (opts->set) { store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, opts->descend, opts->who, opts->perms, *nvlp); } else if (opts->create) { store_allow_perm(ZFS_DELEG_CREATE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else if (opts->everyone) { store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else { char *curr = opts->who; char *end = curr + strlen(curr); while (curr < end) { const char *who; zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; char *endch; char *delim = strchr(curr, ','); char errbuf[256]; char id[64]; struct passwd *p = NULL; struct group *g = NULL; uid_t rid; if (delim == NULL) delim = end; else *delim = '\0'; rid = (uid_t)strtol(curr, &endch, 0); if (opts->user) { who_type = ZFS_DELEG_USER; if (*endch != '\0') p = getpwnam(curr); else p = getpwuid(rid); if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { who_type = ZFS_DELEG_GROUP; if (*endch != '\0') g = getgrnam(curr); else g = getgrgid(rid); if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else { if (*endch != '\0') { p = getpwnam(curr); } else { p = getpwuid(rid); } if (p == NULL) { if (*endch != '\0') { g = getgrnam(curr); } else { g = getgrgid(rid); } } if (p != NULL) { who_type = ZFS_DELEG_USER; rid = p->pw_uid; } else if (g != NULL) { who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { (void) snprintf(errbuf, 256, gettext( "invalid user/group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } (void) sprintf(id, "%u", rid); who = id; store_allow_perm(who_type, opts->local, opts->descend, who, opts->perms, *nvlp); curr = delim + 1; } } return (0); } static void print_set_creat_perms(uu_avl_t *who_avl) { const char *sc_title[] = { gettext("Permission sets:\n"), gettext("Create time permissions:\n"), NULL }; who_perm_node_t *who_node = NULL; int prev_weight = -1; for (who_node = uu_avl_first(who_avl); who_node != NULL; who_node = uu_avl_next(who_avl, who_node)) { uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; const char *who_name = who_node->who_perm.who_name; int weight = who_type2weight(who_type); boolean_t first = B_TRUE; deleg_perm_node_t *deleg_node; if (prev_weight != weight) { (void) printf("%s", sc_title[weight]); prev_weight = weight; } if (who_name == NULL || strnlen(who_name, 1) == 0) (void) printf("\t"); else (void) printf("\t%s ", who_name); for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node 
= uu_avl_next(avl, deleg_node)) { if (first) { (void) printf("%s", deleg_node->dpn_perm.dp_name); first = B_FALSE; } else (void) printf(",%s", deleg_node->dpn_perm.dp_name); } (void) printf("\n"); } } static void print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, const char *title) { who_perm_node_t *who_node = NULL; boolean_t prt_title = B_TRUE; uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((who_node = uu_avl_walk_next(walk)) != NULL) { const char *who_name = who_node->who_perm.who_name; const char *nice_who_name = who_node->who_perm.who_ug_name; uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; char delim = ' '; deleg_perm_node_t *deleg_node; boolean_t prt_who = B_TRUE; for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (local != deleg_node->dpn_perm.dp_local || descend != deleg_node->dpn_perm.dp_descend) continue; if (prt_who) { const char *who = NULL; if (prt_title) { prt_title = B_FALSE; (void) printf("%s", title); } switch (who_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: who = gettext("user"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: who = gettext("group"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: who = gettext("everyone"); who_name = NULL; break; default: assert(who != NULL); } prt_who = B_FALSE; if (who_name == NULL) (void) printf("\t%s", who); else (void) printf("\t%s %s", who, who_name); } (void) printf("%c%s", delim, deleg_node->dpn_perm.dp_name); delim = ','; } if (!prt_who) (void) printf("\n"); } uu_avl_walk_end(walk); } static void print_fs_perms(fs_perm_set_t *fspset) { fs_perm_node_t *node = NULL; char buf[MAXNAMELEN + 32]; const char *dsname = buf; for (node = uu_list_first(fspset->fsps_list); node != NULL; node = uu_list_next(fspset->fsps_list, node)) { uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; int left = 0; (void) snprintf(buf, sizeof (buf), gettext("---- Permissions on %s "), node->fspn_fsperm.fsp_name); (void) printf("%s", dsname); left = 70 - strlen(buf); while (left-- > 0) (void) printf("-"); (void) printf("\n"); print_set_creat_perms(sc_avl); print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, gettext("Local permissions:\n")); print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, gettext("Descendent permissions:\n")); print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, gettext("Local+Descendent permissions:\n")); } } static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; struct deleg_perms { boolean_t un; nvlist_t *nvl; }; static int set_deleg_perms(zfs_handle_t *zhp, void *data) { struct deleg_perms *perms = (struct deleg_perms *)data; zfs_type_t zfs_type = zfs_get_type(zhp); if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) return (0); return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); } static int zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) { zfs_handle_t *zhp; nvlist_t *perm_nvl = NULL; nvlist_t *update_perm_nvl = NULL; int error = 1; int c; struct allow_opts opts = { 0 }; const char *optstr = un ? 
"ldugecsrh" : "ldugecsh"; /* check opts */ while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'l': opts.local = B_TRUE; break; case 'd': opts.descend = B_TRUE; break; case 'u': opts.user = B_TRUE; break; case 'g': opts.group = B_TRUE; break; case 'e': opts.everyone = B_TRUE; break; case 's': opts.set = B_TRUE; break; case 'c': opts.create = B_TRUE; break; case 'r': opts.recursive = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'h': opts.prt_usage = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ parse_allow_args(argc, argv, un, &opts); /* try to open the dataset */ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { (void) fprintf(stderr, "Failed to open dataset: %s\n", opts.dataset); return (-1); } if (zfs_get_fsacl(zhp, &perm_nvl) != 0) goto cleanup2; fs_perm_set_init(&fs_perm_set); if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); goto cleanup1; } if (opts.prt_perms) print_fs_perms(&fs_perm_set); else { (void) construct_fsacl_list(un, &opts, &update_perm_nvl); if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) goto cleanup0; if (un && opts.recursive) { struct deleg_perms data = { un, update_perm_nvl }; if (zfs_iter_filesystems(zhp, set_deleg_perms, &data) != 0) goto cleanup0; } } error = 0; cleanup0: nvlist_free(perm_nvl); nvlist_free(update_perm_nvl); cleanup1: fs_perm_set_fini(&fs_perm_set); cleanup2: zfs_close(zhp); return (error); } static int zfs_do_allow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); } static int zfs_do_unallow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); } static int zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { int errors = 0; int i; const char *tag; boolean_t recursive = B_FALSE; const char *opts = holding ? "rt" : "r"; int c; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 2) usage(B_FALSE); tag = argv[0]; --argc; ++argv; if (holding && tag[0] == '.') { /* tags starting with '.' are reserved for libzfs */ (void) fprintf(stderr, gettext("tag may not start with '.'\n")); usage(B_FALSE); } for (i = 0; i < argc; ++i) { zfs_handle_t *zhp; char parent[ZFS_MAX_DATASET_NAME_LEN]; const char *delim; char *path = argv[i]; delim = strchr(path, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), path); ++errors; continue; } - (void) strncpy(parent, path, delim - path); - parent[delim - path] = '\0'; + (void) strlcpy(parent, path, MIN(sizeof (parent), + delim - path + 1)); zhp = zfs_open(g_zfs, parent, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { ++errors; continue; } if (holding) { if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) ++errors; } else { if (zfs_release(zhp, delim+1, tag, recursive) != 0) ++errors; } zfs_close(zhp); } return (errors != 0); } /* * zfs hold [-r] [-t] ... * * -r Recursively hold * * Apply a user-hold with the given tag to the list of snapshots. 
*/ static int zfs_do_hold(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } /* * zfs release [-r] ... * * -r Recursively release * * Release a user-hold with the given tag from the list of snapshots. */ static int zfs_do_release(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct holds_cbdata { boolean_t cb_recursive; const char *cb_snapname; nvlist_t **cb_nvlp; size_t cb_max_namelen; size_t cb_max_taglen; } holds_cbdata_t; #define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" #define DATETIME_BUF_LEN (32) /* * */ static void print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl) { int i; nvpair_t *nvp = NULL; const char *const hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; const char *col; if (!scripted) { for (i = 0; i < 3; i++) { col = gettext(hdr_cols[i]); if (i < 2) (void) printf("%-*s ", i ? tagwidth : nwidth, col); else (void) printf("%s\n", col); } } while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { char *zname = nvpair_name(nvp); nvlist_t *nvl2; nvpair_t *nvp2 = NULL; (void) nvpair_value_nvlist(nvp, &nvl2); while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { char tsbuf[DATETIME_BUF_LEN]; const char *tagname = nvpair_name(nvp2); uint64_t val = 0; time_t time; struct tm t; (void) nvpair_value_uint64(nvp2, &val); time = (time_t)val; (void) localtime_r(&time, &t); (void) strftime(tsbuf, DATETIME_BUF_LEN, gettext(STRFTIME_FMT_STR), &t); if (scripted) { (void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf); } else { (void) printf("%-*s %-*s %s\n", nwidth, zname, tagwidth, tagname, tsbuf); } } } } /* * Generic callback function to list a dataset or snapshot. */ static int holds_callback(zfs_handle_t *zhp, void *data) { holds_cbdata_t *cbp = data; nvlist_t *top_nvl = *cbp->cb_nvlp; nvlist_t *nvl = NULL; nvpair_t *nvp = NULL; const char *zname = zfs_get_name(zhp); size_t znamelen = strlen(zname); if (cbp->cb_recursive) { const char *snapname; char *delim = strchr(zname, '@'); if (delim == NULL) return (0); snapname = delim + 1; if (strcmp(cbp->cb_snapname, snapname)) return (0); } if (zfs_get_holds(zhp, &nvl) != 0) return (-1); if (znamelen > cbp->cb_max_namelen) cbp->cb_max_namelen = znamelen; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *tag = nvpair_name(nvp); size_t taglen = strlen(tag); if (taglen > cbp->cb_max_taglen) cbp->cb_max_taglen = taglen; } return (nvlist_add_nvlist(top_nvl, zname, nvl)); } /* * zfs holds [-rH] ... * * -r Lists holds that are set on the named snapshots recursively. * -H Scripted mode; elide headers and separate columns by tabs. 
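 *
 * For example (again with a hypothetical snapshot name), this lists
 * the holds on tank/home@monday and on same-named snapshots of all
 * descendant datasets, in tab-separated scripted form:
 *
 *	# zfs holds -rH tank/home@monday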
*/ static int zfs_do_holds(int argc, char **argv) { int c; boolean_t errors = B_FALSE; boolean_t scripted = B_FALSE; boolean_t recursive = B_FALSE; int types = ZFS_TYPE_SNAPSHOT; holds_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "rH")) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case 'H': scripted = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (recursive) { types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; flags |= ZFS_ITER_RECURSE; } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) usage(B_FALSE); nvlist_t *nvl = fnvlist_alloc(); for (int i = 0; i < argc; ++i) { char *snapshot = argv[i]; const char *delim; const char *snapname; delim = strchr(snapshot, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), snapshot); errors = B_TRUE; continue; } snapname = delim + 1; if (recursive) snapshot[delim - snapshot] = '\0'; cb.cb_recursive = recursive; cb.cb_snapname = snapname; cb.cb_nvlp = &nvl; /* * 1. collect holds data, set format options */ ret = zfs_for_each(1, argv + i, flags, types, NULL, NULL, limit, holds_callback, &cb); if (ret != 0) errors = B_TRUE; } /* * 2. print holds data */ print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); if (nvlist_empty(nvl)) (void) fprintf(stderr, gettext("no datasets available\n")); nvlist_free(nvl); return (errors); } #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; static int get_one_dataset(zfs_handle_t *zhp, void *data) { static const char *const spin[] = { "-", "\\", "|", "/" }; static int spinval = 0; static int spincheck = 0; static time_t last_spin_time = (time_t)0; get_all_state_t *state = data; zfs_type_t type = zfs_get_type(zhp); if (state->ga_verbose) { if (--spincheck < 0) { time_t now = time(NULL); if (last_spin_time + SPINNER_TIME < now) { update_progress(spin[spinval++ % 4]); last_spin_time = now; } spincheck = CHECK_SPINNER; } } /* * Iterate over any nested datasets. */ if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } /* * Skip any datasets whose type does not match. */ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } libzfs_add_handle(state->ga_cbp, zhp); assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); return (0); } static void get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) { get_all_state_t state = { .ga_verbose = verbose, .ga_cbp = cbp }; if (verbose) set_progress_header(gettext("Reading ZFS config")); (void) zfs_iter_root(g_zfs, get_one_dataset, &state); if (verbose) finish_progress(gettext("done.")); } /* * Generic callback for sharing or mounting filesystems. Because the code is so * similar, we have a common function with an extra parameter to determine which * mode we are using. 
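 *
 * The mode is passed as OP_SHARE or OP_MOUNT (see the enum and
 * share_mount_one() below). A minimal caller sketch, assuming 'zhp'
 * is an already-open ZFS_TYPE_FILESYSTEM handle, that explicitly
 * mounts one dataset with default options and no share protocol:
 *
 *	ret = share_mount_one(zhp, OP_MOUNT, 0, SA_NO_PROTOCOL,
 *	    B_TRUE, NULL);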
*/ typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; typedef struct share_mount_state { share_mount_op_t sm_op; boolean_t sm_verbose; int sm_flags; char *sm_options; enum sa_protocol sm_proto; /* only valid for OP_SHARE */ pthread_mutex_t sm_lock; /* protects the remaining fields */ uint_t sm_total; /* number of filesystems to process */ uint_t sm_done; /* number of filesystems processed */ int sm_status; /* -1 if any of the share/mount operations failed */ } share_mount_state_t; /* * Share or mount a dataset. */ static int share_mount_one(zfs_handle_t *zhp, int op, int flags, enum sa_protocol protocol, boolean_t explicit, const char *options) { char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char smbshareopts[ZFS_MAXPROPLEN]; const char *cmdname = op == OP_SHARE ? "share" : "mount"; struct mnttab mnt; uint64_t zoned, canmount; boolean_t shared_nfs, shared_smb; assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); /* * Check to make sure we can mount/share this dataset. If we * are in the global zone and the filesystem is exported to a * local zone, or if we are in a local zone and the * filesystem is not exported, then it is an error. */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned && getzoneid() == GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "dataset is exported to a local zone\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "permission denied\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * Ignore any filesystems which don't apply to us. This * includes those with a legacy mountpoint, or those with * legacy share options. */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use exports(5) or " "smb.conf(5) to share this filesystem, or set " "the sharenfs or sharesmb property\n")); return (1); } /* * We cannot share or mount legacy filesystems. If the * shareopts is non-legacy but the mountpoint is legacy, we * treat it as a legacy share. 
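 *
 * For example, a dataset with sharenfs=on but mountpoint=legacy is
 * still rejected here; it has to be handled with the legacy mount and
 * export tools rather than with 'zfs mount' or 'zfs share'.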
*/ if (strcmp(mountpoint, "legacy") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use %s(8) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } if (strcmp(mountpoint, "none") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': no " "mountpoint set\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * canmount explicit outcome * on no pass through * on yes pass through * off no return 0 * off yes display error, return 1 * noauto no return 0 * noauto yes pass through */ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "'canmount' property is set to 'off'\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { /* * When performing a 'zfs mount -a', we skip any mounts for * datasets that have 'noauto' set. Sharing a dataset with * 'noauto' set is only allowed if it's mounted. */ if (op == OP_MOUNT) return (0); if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { /* also purge it from existing exports */ zfs_unshare(zhp, mountpoint, NULL); return (0); } } /* * If this filesystem is encrypted and does not have * a loaded key, we can not mount it. */ if ((flags & MS_CRYPT) == 0 && zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "encryption key not loaded\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * If this filesystem is inconsistent and has a receive resume * token, we can not mount it. */ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Contains partially-completed state from " "\"zfs receive -s\", which can be resumed with " "\"zfs send -t\"\n"), cmdname, zfs_get_name(zhp)); return (1); } if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Dataset is not complete, was created by receiving " "a redacted zfs send stream.\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the * filesystem is already mounted or shared, return (failing * for explicit requests); otherwise mount or share the * filesystem. */ switch (op) { case OP_SHARE: { enum sa_protocol prot[] = {SA_PROTOCOL_NFS, SA_NO_PROTOCOL}; shared_nfs = zfs_is_shared(zhp, NULL, prot); *prot = SA_PROTOCOL_SMB; shared_smb = zfs_is_shared(zhp, NULL, prot); if ((shared_nfs && shared_smb) || (shared_nfs && strcmp(shareopts, "on") == 0 && strcmp(smbshareopts, "off") == 0) || (shared_smb && strcmp(smbshareopts, "on") == 0 && strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share " "'%s': filesystem already shared\n"), zfs_get_name(zhp)); return (1); } if (!zfs_is_mounted(zhp, NULL) && zfs_mount(zhp, NULL, flags) != 0) return (1); *prot = protocol; if (zfs_share(zhp, protocol == SA_NO_PROTOCOL ? 
NULL : prot)) return (1); } break; case OP_MOUNT: mnt.mnt_mntopts = (char *)(options ?: ""); if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && zfs_is_mounted(zhp, NULL)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot mount " "'%s': filesystem already mounted\n"), zfs_get_name(zhp)); return (1); } if (zfs_mount(zhp, options, flags) != 0) return (1); break; } return (0); } /* * Reports progress in the form "(current/total)". Not thread-safe. */ static void report_mount_progress(int current, int total) { static time_t last_progress_time = 0; time_t now = time(NULL); char info[32]; /* display header if we're here for the first time */ if (current == 1) { set_progress_header(gettext("Mounting ZFS filesystems")); } else if (current != total && last_progress_time + MOUNT_TIME >= now) { /* too soon to report again */ return; } last_progress_time = now; (void) sprintf(info, "(%d/%d)", current, total); if (current == total) finish_progress(info); else update_progress(info); } /* * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and * updates the progress meter. */ static int share_mount_one_cb(zfs_handle_t *zhp, void *arg) { share_mount_state_t *sms = arg; int ret; ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, B_FALSE, sms->sm_options); pthread_mutex_lock(&sms->sm_lock); if (ret != 0) sms->sm_status = ret; sms->sm_done++; if (sms->sm_verbose) report_mount_progress(sms->sm_done, sms->sm_total); pthread_mutex_unlock(&sms->sm_lock); return (ret); } static void append_options(char *mntopts, char *newopts) { int len = strlen(mntopts); /* original length plus new string to append plus 1 for the comma */ if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { (void) fprintf(stderr, gettext("the opts argument for " "'%s' option is too long (more than %d chars)\n"), "-o", MNT_LINE_MAX); usage(B_FALSE); } if (*mntopts) mntopts[len++] = ','; (void) strcpy(&mntopts[len], newopts); } static enum sa_protocol sa_protocol_decode(const char *protocol) { for (enum sa_protocol i = 0; i < ARRAY_SIZE(sa_protocol_names); ++i) if (strcmp(protocol, sa_protocol_names[i]) == 0) return (i); (void) fputs(gettext("share type must be one of: "), stderr); for (enum sa_protocol i = 0; i < ARRAY_SIZE(sa_protocol_names); ++i) (void) fprintf(stderr, "%s%s", i != 0 ? ", " : "", sa_protocol_names[i]); (void) fputc('\n', stderr); usage(B_FALSE); } static int share_mount(int op, int argc, char **argv) { int do_all = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":alvo:Of" : "al")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'v': verbose = B_TRUE; break; case 'l': flags |= MS_CRYPT; break; case 'o': if (*optarg == '\0') { (void) fprintf(stderr, gettext("empty mount " "options (-o) specified\n")); usage(B_FALSE); } if (options == NULL) options = safe_malloc(MNT_LINE_MAX + 1); /* option validation is done later */ append_options(options, optarg); break; case 'O': flags |= MS_OVERLAY; break; case 'f': flags |= MS_FORCE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (do_all) { enum sa_protocol protocol = SA_NO_PROTOCOL; if (op == OP_SHARE && argc > 0) { protocol = sa_protocol_decode(argv[0]); argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } start_progress_timer(); get_all_cb_t cb = { 0 }; get_all_datasets(&cb, verbose); if (cb.cb_used == 0) { free(options); return (0); } share_mount_state_t share_mount_state = { 0 }; share_mount_state.sm_op = op; share_mount_state.sm_verbose = verbose; share_mount_state.sm_flags = flags; share_mount_state.sm_options = options; share_mount_state.sm_proto = protocol; share_mount_state.sm_total = cb.cb_used; pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* For a 'zfs share -a' operation start with a clean slate. */ zfs_truncate_shares(NULL); /* * libshare isn't mt-safe, so only do the operation in parallel * if we're mounting. Additionally, the key-loading option must * be serialized so that we can prompt the user for their keys * in a consistent manner. */ zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, share_mount_one_cb, &share_mount_state, op == OP_MOUNT && !(flags & MS_CRYPT)); zfs_commit_shares(NULL); ret = share_mount_state.sm_status; for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); } else if (argc == 0) { FILE *mnttab; struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { (void) fprintf(stderr, gettext("missing filesystem " "argument (specify -a for all)\n")); usage(B_FALSE); } /* * When mount is given no arguments, go through * /proc/self/mounts and display any active ZFS mounts. * We hide any snapshots, since they are controlled * automatically. */ if ((mnttab = fopen(MNTTAB, "re")) == NULL) { free(options); return (ENOENT); } while (getmntent(mnttab, &entry) == 0) { if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || strchr(entry.mnt_special, '@') != NULL) continue; (void) printf("%-30s %s\n", entry.mnt_special, entry.mnt_mountp); } (void) fclose(mnttab); } else { zfs_handle_t *zhp; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; } else { ret = share_mount_one(zhp, op, flags, SA_NO_PROTOCOL, B_TRUE, options); zfs_commit_shares(NULL); zfs_close(zhp); } } free(options); return (ret); } /* * zfs mount -a * zfs mount filesystem * * Mount all filesystems, or mount the given filesystem. */ static int zfs_do_mount(int argc, char **argv) { return (share_mount(OP_MOUNT, argc, argv)); } /* * zfs share -a [nfs | smb] * zfs share filesystem * * Share all filesystems, or share the given filesystem. 
*/ static int zfs_do_share(int argc, char **argv) { return (share_mount(OP_SHARE, argc, argv)); } typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; uu_avl_node_t un_avlnode; } unshare_unmount_node_t; static int unshare_unmount_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const unshare_unmount_node_t *l = larg; const unshare_unmount_node_t *r = rarg; return (strcmp(l->un_mountp, r->un_mountp)); } /* * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an * absolute path, find the entry /proc/self/mounts, verify that it's a * ZFS filesystem, and unmount it appropriately. */ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; int ret = 0; struct stat64 statbuf; struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; /* * Search for the given (major,minor) pair in the mount table. */ if (getextmntent(path, &entry, &statbuf) != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); return (1); } (void) fprintf(stderr, gettext("warning: %s not in" "/proc/self/mounts\n"), path); if ((ret = umount2(path, flags)) != 0) (void) fprintf(stderr, gettext("%s: %s\n"), path, strerror(errno)); return (ret != 0); } path_inode = statbuf.st_ino; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " "filesystem\n"), cmdname, path); return (1); } if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); ret = 1; if (stat64(entry.mnt_mountp, &statbuf) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), cmdname, path, strerror(errno)); goto out; } else if (statbuf.st_ino != path_inode) { (void) fprintf(stderr, gettext("cannot " "%s '%s': not a mountpoint\n"), cmdname, path); goto out; } if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(smbshare_prop, "off") == 0) { (void) fprintf(stderr, gettext("cannot unshare " "'%s': legacy share\n"), path); (void) fprintf(stderr, gettext("use exportfs(8) " "or smbcontrol(1) to unshare this filesystem\n")); } else if (!zfs_is_shared(zhp, NULL, NULL)) { (void) fprintf(stderr, gettext("cannot unshare '%s': " "not currently shared\n"), path); } else { ret = zfs_unshare(zhp, path, NULL); zfs_commit_shares(NULL); } } else { char mtpt_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); if (is_manual) { ret = zfs_unmount(zhp, NULL, flags); } else if (strcmp(mtpt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot unmount " "'%s': legacy mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use umount(8) " "to unmount this filesystem\n")); } else { ret = zfs_unmountall(zhp, flags); } } out: zfs_close(zhp); return (ret != 0); } /* * Generic callback for unsharing or unmounting a filesystem. 
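 *
 * Handles both 'zfs unmount' (called with OP_MOUNT) and 'zfs unshare'
 * (called with OP_SHARE); see zfs_do_unmount() and zfs_do_unshare()
 * below.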
*/ static int unshare_unmount(int op, int argc, char **argv) { int do_all = 0; int flags = 0; int ret = 0; int c; zfs_handle_t *zhp; char nfs_mnt_prop[ZFS_MAXPROPLEN]; char sharesmb[ZFS_MAXPROPLEN]; /* check options */ while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'f': flags |= MS_FORCE; break; case 'u': flags |= MS_CRYPT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (do_all) { /* * We could make use of zfs_for_each() to walk all datasets in * the system, but this would be very inefficient, especially * since we would have to linearly search /proc/self/mounts for * each one. Instead, do one pass through /proc/self/mounts * looking for zfs entries and call zfs_unmount() for each one. * * Things get a little tricky if the administrator has created * mountpoints beneath other ZFS filesystems. In this case, we * have to unmount the deepest filesystems first. To accomplish * this, we place all the mountpoints in an AVL tree sorted by * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. */ FILE *mnttab; struct mnttab entry; uu_avl_pool_t *pool; uu_avl_t *tree = NULL; unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; enum sa_protocol *protocol = NULL, single_protocol[] = {SA_NO_PROTOCOL, SA_NO_PROTOCOL}; if (op == OP_SHARE && argc > 0) { *single_protocol = sa_protocol_decode(argv[0]); protocol = single_protocol; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (((pool = uu_avl_pool_create("unmount_pool", sizeof (unshare_unmount_node_t), offsetof(unshare_unmount_node_t, un_avlnode), unshare_unmount_compare, UU_DEFAULT)) == NULL) || ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) nomem(); if ((mnttab = fopen(MNTTAB, "re")) == NULL) { uu_avl_destroy(tree); uu_avl_pool_destroy(pool); return (ENOENT); } while (getmntent(mnttab, &entry) == 0) { /* ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* ignore snapshots */ if (strchr(entry.mnt_special, '@') != NULL) continue; if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; continue; } /* * Ignore datasets that are excluded/restricted by * parent pool name. 
*/ if (zpool_skip_pool(zfs_get_pool_name(zhp))) { zfs_close(zhp); continue; } switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") != 0) break; verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0) continue; break; case OP_MOUNT: /* Ignore legacy mounts */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "legacy") == 0) continue; /* Ignore canmount=noauto mounts */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) continue; break; default: break; } node = safe_malloc(sizeof (unshare_unmount_node_t)); node->un_zhp = zhp; node->un_mountp = safe_strdup(entry.mnt_mountp); uu_avl_node_init(node, &node->un_avlnode, pool); if (uu_avl_find(tree, node, NULL, &idx) == NULL) { uu_avl_insert(tree, node, idx); } else { zfs_close(node->un_zhp); free(node->un_mountp); free(node); } } (void) fclose(mnttab); /* * Walk the AVL tree in reverse, unmounting each filesystem and * removing it from the AVL tree in the process. */ if ((walk = uu_avl_walk_start(tree, UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { const char *mntarg = NULL; uu_avl_remove(tree, node); switch (op) { case OP_SHARE: if (zfs_unshare(node->un_zhp, node->un_mountp, protocol) != 0) ret = 1; break; case OP_MOUNT: if (zfs_unmount(node->un_zhp, mntarg, flags) != 0) ret = 1; break; } zfs_close(node->un_zhp); free(node->un_mountp); free(node); } if (op == OP_SHARE) zfs_commit_shares(protocol); uu_avl_walk_end(walk); uu_avl_destroy(tree); uu_avl_pool_destroy(pool); } else { if (argc != 1) { if (argc == 0) (void) fprintf(stderr, gettext("missing filesystem argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * We have an argument, but it may be a full path or a ZFS * filesystem. Pass full paths off to unmount_path() (shared by * manual_unmount), otherwise open the filesystem and pass to * zfs_unmount(). */ if (argv[0][0] == '/') return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) return (1); verify(zfs_prop_get(zhp, op == OP_SHARE ? 
ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, sharesmb, sizeof (sharesmb), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(sharesmb, "off") == 0) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "exports(5) or smb.conf(5) to unshare " "this filesystem\n")); ret = 1; } else if (!zfs_is_shared(zhp, NULL, NULL)) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': not currently " "shared\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unshareall(zhp, NULL) != 0) { ret = 1; } break; case OP_MOUNT: if (strcmp(nfs_mnt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "umount(8) to unmount this " "filesystem\n")); ret = 1; } else if (!zfs_is_mounted(zhp, NULL)) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': not currently " "mounted\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unmountall(zhp, flags) != 0) { ret = 1; } break; } zfs_close(zhp); } return (ret); } /* * zfs unmount [-fu] -a * zfs unmount [-fu] filesystem * * Unmount all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unmount(int argc, char **argv) { return (unshare_unmount(OP_MOUNT, argc, argv)); } /* * zfs unshare -a * zfs unshare filesystem * * Unshare all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unshare(int argc, char **argv) { return (unshare_unmount(OP_SHARE, argc, argv)); } static int find_command_idx(const char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } static int zfs_do_diff(int argc, char **argv) { zfs_handle_t *zhp; int flags = 0; char *tosnap = NULL; char *fromsnap = NULL; char *atp, *copy; int err = 0; int c; struct sigaction sa; while ((c = getopt(argc, argv, "FHth")) != -1) { switch (c) { case 'F': flags |= ZFS_DIFF_CLASSIFY; break; case 'H': flags |= ZFS_DIFF_PARSEABLE; break; case 't': flags |= ZFS_DIFF_TIMESTAMP; break; case 'h': flags |= ZFS_DIFF_NO_MANGLE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("must provide at least one snapshot name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } fromsnap = argv[0]; tosnap = (argc == 2) ? 
argv[1] : NULL; copy = NULL; if (*fromsnap != '@') copy = strdup(fromsnap); else if (tosnap) copy = strdup(tosnap); if (copy == NULL) usage(B_FALSE); if ((atp = strchr(copy, '@')) != NULL) *atp = '\0'; if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { free(copy); return (1); } free(copy); /* * Ignore SIGPIPE so that the library can give us * information on any failure */ if (sigemptyset(&sa.sa_mask) == -1) { err = errno; goto out; } sa.sa_flags = 0; sa.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &sa, NULL) == -1) { err = errno; goto out; } err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); out: zfs_close(zhp); return (err != 0); } /* * zfs bookmark | * * Creates a bookmark with the given name from the source snapshot * or creates a copy of an existing source bookmark. */ static int zfs_do_bookmark(int argc, char **argv) { char *source, *bookname; char expbuf[ZFS_MAX_DATASET_NAME_LEN]; int source_type; nvlist_t *nvl; int ret = 0; int c; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing bookmark argument\n")); goto usage; } source = argv[0]; bookname = argv[1]; if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { (void) fprintf(stderr, gettext("invalid source name '%s': " "must contain a '@' or '#'\n"), source); goto usage; } if (strchr(bookname, '#') == NULL) { (void) fprintf(stderr, gettext("invalid bookmark name '%s': " "must contain a '#'\n"), bookname); goto usage; } /* * expand source or bookname to full path: * one of them may be specified as short name */ { char **expand; char *source_short, *bookname_short; source_short = strpbrk(source, "@#"); bookname_short = strpbrk(bookname, "#"); if (source_short == source && bookname_short == bookname) { (void) fprintf(stderr, gettext( "either source or bookmark must be specified as " "full dataset paths")); goto usage; } else if (source_short != source && bookname_short != bookname) { expand = NULL; } else if (source_short != source) { strlcpy(expbuf, source, sizeof (expbuf)); expand = &bookname; } else if (bookname_short != bookname) { strlcpy(expbuf, bookname, sizeof (expbuf)); expand = &source; } else { abort(); } if (expand != NULL) { *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ (void) strlcat(expbuf, *expand, sizeof (expbuf)); *expand = expbuf; } } /* determine source type */ switch (*strpbrk(source, "@#")) { case '@': source_type = ZFS_TYPE_SNAPSHOT; break; case '#': source_type = ZFS_TYPE_BOOKMARK; break; default: abort(); } /* test the source exists */ zfs_handle_t *zhp; zhp = zfs_open(g_zfs, source, source_type); if (zhp == NULL) goto usage; zfs_close(zhp); nvl = fnvlist_alloc(); fnvlist_add_string(nvl, bookname, source); ret = lzc_bookmark(nvl, NULL); fnvlist_free(nvl); if (ret != 0) { const char *err_msg = NULL; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create bookmark '%s'"), bookname); switch (ret) { case EXDEV: err_msg = "bookmark is in a different pool"; break; case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: err_msg = "source is not an ancestor of the " "new bookmark's dataset"; break; case EEXIST: err_msg = "bookmark exists"; break; case EINVAL: err_msg = "invalid argument"; break; case ENOTSUP: 
err_msg = "bookmark feature not enabled"; break; case ENOSPC: err_msg = "out of space"; break; case ENOENT: err_msg = "dataset does not exist"; break; default: (void) zfs_standard_error(g_zfs, ret, errbuf); break; } if (err_msg != NULL) { (void) fprintf(stderr, "%s: %s\n", errbuf, dgettext(TEXT_DOMAIN, err_msg)); } } return (ret != 0); usage: usage(B_FALSE); return (-1); } static int zfs_do_channel_program(int argc, char **argv) { int ret, fd, c; size_t progsize, progread; nvlist_t *outnvl = NULL; uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; /* check options */ while ((c = getopt(argc, argv, "nt:m:j")) != -1) { switch (c) { case 't': case 'm': { uint64_t arg; char *endp; errno = 0; arg = strtoull(optarg, &endp, 0); if (errno != 0 || *endp != '\0') { (void) fprintf(stderr, gettext( "invalid argument " "'%s': expected integer\n"), optarg); goto usage; } if (c == 't') { instrlimit = arg; } else { ASSERT3U(c, ==, 'm'); memlimit = arg; } break; } case 'n': { sync_flag = B_FALSE; break; } case 'j': { json_output = B_TRUE; break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, gettext("invalid number of arguments\n")); goto usage; } const char *poolname = argv[0]; const char *filename = argv[1]; if (strcmp(filename, "-") == 0) { fd = 0; filename = "standard input"; } else if ((fd = open(filename, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), filename, strerror(errno)); return (1); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("cannot open pool '%s'\n"), poolname); if (fd != 0) (void) close(fd); return (1); } zpool_close(zhp); /* * Read in the channel program, expanding the program buffer as * necessary. */ progread = 0; progsize = 1024; char *progbuf = safe_malloc(progsize); do { ret = read(fd, progbuf + progread, progsize - progread); progread += ret; if (progread == progsize && ret > 0) { progsize *= 2; progbuf = safe_realloc(progbuf, progsize); } } while (ret > 0); if (fd != 0) (void) close(fd); if (ret < 0) { free(progbuf); (void) fprintf(stderr, gettext("cannot read '%s': %s\n"), filename, strerror(errno)); return (1); } progbuf[progread] = '\0'; /* * Any remaining arguments are passed as arguments to the lua script as * a string array: * { * "argv" -> [ "arg 1", ... "arg n" ], * } */ nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, (const char **)argv + 2, argc - 2); if (sync_flag) { ret = lzc_channel_program(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } else { ret = lzc_channel_program_nosync(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } if (ret != 0) { /* * On error, report the error message handed back by lua if one * exists. Otherwise, generate an appropriate error message, * falling back on strerror() for an unexpected return code. 
*/ const char *errstring = NULL; const char *msg = gettext("Channel program execution failed"); uint64_t instructions = 0; if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { char *es = NULL; (void) nvlist_lookup_string(outnvl, ZCP_RET_ERROR, &es); if (es == NULL) errstring = strerror(ret); else errstring = es; if (ret == ETIME) { (void) nvlist_lookup_uint64(outnvl, ZCP_ARG_INSTRLIMIT, &instructions); } } else { switch (ret) { case EINVAL: errstring = "Invalid instruction or memory limit."; break; case ENOMEM: errstring = "Return value too large."; break; case ENOSPC: errstring = "Memory limit exhausted."; break; case ETIME: errstring = "Timed out."; break; case EPERM: errstring = "Permission denied. Channel " "programs must be run as root."; break; default: (void) zfs_standard_error(g_zfs, ret, msg); } } if (errstring != NULL) (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); if (ret == ETIME && instructions != 0) (void) fprintf(stderr, gettext("%llu Lua instructions\n"), (u_longlong_t)instructions); } else { if (json_output) { (void) nvlist_print_json(stdout, outnvl); } else if (nvlist_empty(outnvl)) { (void) fprintf(stdout, gettext("Channel program fully " "executed and did not produce output.\n")); } else { (void) fprintf(stdout, gettext("Channel program fully " "executed and produced output:\n")); dump_nvlist(outnvl, 4); } } free(progbuf); fnvlist_free(outnvl); fnvlist_free(argnvl); return (ret != 0); usage: usage(B_FALSE); return (-1); } typedef struct loadkey_cbdata { boolean_t cb_loadkey; boolean_t cb_recursive; boolean_t cb_noop; char *cb_keylocation; uint64_t cb_numfailed; uint64_t cb_numattempted; } loadkey_cbdata_t; static int load_key_callback(zfs_handle_t *zhp, void *data) { int ret; boolean_t is_encroot; loadkey_cbdata_t *cb = data; uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); /* * If we are working recursively, we want to skip loading / unloading * keys for non-encryption roots and datasets whose keys are already * in the desired end-state. 
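 * For example, 'zfs load-key -a' should quietly skip an encryption root
 * whose key is already loaded rather than count it as a failure, and
 * 'zfs unload-key -r pool/fs' should skip children that merely inherit
 * their key from pool/fs (illustrative dataset name).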
*/ if (cb->cb_recursive) { ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); if (ret != 0) return (ret); if (!is_encroot) return (0); if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) || (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE)) return (0); } cb->cb_numattempted++; if (cb->cb_loadkey) ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation); else ret = zfs_crypto_unload_key(zhp); if (ret != 0) { cb->cb_numfailed++; return (ret); } return (0); } static int load_unload_keys(int argc, char **argv, boolean_t loadkey) { int c, ret = 0, flags = 0; boolean_t do_all = B_FALSE; loadkey_cbdata_t cb = { 0 }; cb.cb_loadkey = loadkey; while ((c = getopt(argc, argv, "anrL:")) != -1) { /* noop and alternate keylocations only apply to zfs load-key */ if (loadkey) { switch (c) { case 'n': cb.cb_noop = B_TRUE; continue; case 'L': cb.cb_keylocation = optarg; continue; default: break; } } switch (c) { case 'a': do_all = B_TRUE; cb.cb_recursive = B_TRUE; break; case 'r': flags |= ZFS_ITER_RECURSE; cb.cb_recursive = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!do_all && argc == 0) { (void) fprintf(stderr, gettext("Missing dataset argument or -a option\n")); usage(B_FALSE); } if (do_all && argc != 0) { (void) fprintf(stderr, gettext("Cannot specify dataset with -a option\n")); usage(B_FALSE); } if (cb.cb_recursive && cb.cb_keylocation != NULL && strcmp(cb.cb_keylocation, "prompt") != 0) { (void) fprintf(stderr, gettext("alternate keylocation may only " "be 'prompt' with -r or -a\n")); usage(B_FALSE); } ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0, load_key_callback, &cb); if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) { (void) printf(gettext("%llu / %llu key(s) successfully %s\n"), (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), (u_longlong_t)cb.cb_numattempted, loadkey ? (cb.cb_noop ? 
"verified" : "loaded") : "unloaded"); } if (cb.cb_numfailed != 0) ret = -1; return (ret); } static int zfs_do_load_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_TRUE)); } static int zfs_do_unload_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_FALSE)); } static int zfs_do_change_key(int argc, char **argv) { int c, ret; uint64_t keystatus; boolean_t loadkey = B_FALSE, inheritkey = B_FALSE; zfs_handle_t *zhp = NULL; nvlist_t *props = fnvlist_alloc(); while ((c = getopt(argc, argv, "lio:")) != -1) { switch (c) { case 'l': loadkey = B_TRUE; break; case 'i': inheritkey = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (inheritkey && !nvlist_empty(props)) { (void) fprintf(stderr, gettext("Properties not allowed for inheriting\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("Too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[argc - 1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (loadkey) { keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus != ZFS_KEYSTATUS_AVAILABLE) { ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } } /* refresh the properties so the new keystatus is visible */ zfs_refresh_properties(zhp); } ret = zfs_crypto_rewrap(zhp, props, inheritkey); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } nvlist_free(props); zfs_close(zhp); return (0); } /* * 1) zfs project [-d|-r] * List project ID and inherit flag of file(s) or directories. * -d: List the directory itself, not its children. * -r: List subdirectories recursively. * * 2) zfs project -C [-k] [-r] * Clear project inherit flag and/or ID on the file(s) or directories. * -k: Keep the project ID unchanged. If not specified, the project ID * will be reset as zero. * -r: Clear on subdirectories recursively. * * 3) zfs project -c [-0] [-d|-r] [-p id] * Check project ID and inherit flag on the file(s) or directories, * report the outliers. * -0: Print file name followed by a NUL instead of newline. * -d: Check the directory itself, not its children. * -p: Specify the referenced ID for comparing with the target file(s) * or directories' project IDs. If not specified, the target (top) * directory's project ID will be used as the referenced one. * -r: Check subdirectories recursively. * * 4) zfs project [-p id] [-r] [-s] * Set project ID and/or inherit flag on the file(s) or directories. * -p: Set the project ID as the given id. * -r: Set on subdirectories recursively. If not specify "-p" option, * it will use top-level directory's project ID as the given id, * then set both project ID and inherit flag on all descendants * of the top-level directory. * -s: Set project inherit flag. 
*/ static int zfs_do_project(int argc, char **argv) { zfs_project_control_t zpc = { .zpc_expected_projid = ZFS_INVALID_PROJID, .zpc_op = ZFS_PROJECT_OP_DEFAULT, .zpc_dironly = B_FALSE, .zpc_keep_projid = B_FALSE, .zpc_newline = B_TRUE, .zpc_recursive = B_FALSE, .zpc_set_flag = B_FALSE, }; int ret = 0, c; if (argc < 2) usage(B_FALSE); while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) { switch (c) { case '0': zpc.zpc_newline = B_FALSE; break; case 'C': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CLEAR; break; case 'c': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CHECK; break; case 'd': zpc.zpc_dironly = B_TRUE; /* overwrite "-r" option */ zpc.zpc_recursive = B_FALSE; break; case 'k': zpc.zpc_keep_projid = B_TRUE; break; case 'p': { char *endptr; errno = 0; zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("project ID must be less than " "%u\n"), UINT32_MAX); usage(B_FALSE); } if (zpc.zpc_expected_projid >= UINT32_MAX) { (void) fprintf(stderr, gettext("invalid project ID\n")); usage(B_FALSE); } break; } case 'r': zpc.zpc_recursive = B_TRUE; /* overwrite "-d" option */ zpc.zpc_dironly = B_FALSE; break; case 's': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_set_flag = B_TRUE; zpc.zpc_op = ZFS_PROJECT_OP_SET; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) { if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) zpc.zpc_op = ZFS_PROJECT_OP_SET; else zpc.zpc_op = ZFS_PROJECT_OP_LIST; } switch (zpc.zpc_op) { case ZFS_PROJECT_OP_LIST: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CHECK: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CLEAR: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) { (void) fprintf(stderr, gettext("'-p' is useless together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_SET: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless for set project ID and/or " "inherit flag\n")); usage(B_FALSE); } if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; default: ASSERT(0); break; } argv += optind; argc -= optind; if (argc == 0) { (void) fprintf(stderr, gettext("missing file or directory target(s)\n")); usage(B_FALSE); } for (int i = 0; i < argc; i++) { int err; err = zfs_project_handle(argv[i], &zpc); if (err && !ret) ret = err; } return (ret); } static int zfs_do_wait(int argc, char **argv) { boolean_t 
enabled[ZFS_WAIT_NUM_ACTIVITIES]; int error, i; int c; /* By default, wait for all types of activity. */ for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "t:")) != -1) { switch (c) { case 't': /* Reset activities array */ memset(&enabled, 0, sizeof (enabled)); for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_subopts[ ZFS_WAIT_NUM_ACTIVITIES] = { "deleteq" }; for (i = 0; i < ARRAY_SIZE(col_subopts); ++i) if (strcmp(tok, col_subopts[i]) == 0) { enabled[i] = B_TRUE; goto found; } (void) fprintf(stderr, gettext("invalid activity '%s'\n"), tok); usage(B_FALSE); found:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argv += optind; argc -= optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'filesystem' " "argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!enabled[i]) continue; error = zfs_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zfs_close(zhp); return (error); } /* * Display version message */ static int zfs_do_version(int argc, char **argv) { (void) argc, (void) argv; return (zfs_version_print() != 0); } int main(int argc, char **argv) { int ret = 0; int i = 0; const char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * The 'umount' command is an alias for 'unmount' */ if (strcmp(cmdname, "umount") == 0) cmdname = "unmount"; /* * The 'recv' command is an alias for 'receive' */ if (strcmp(cmdname, "recv") == 0) cmdname = "receive"; /* * The 'snap' command is an alias for 'snapshot' */ if (strcmp(cmdname, "snap") == 0) cmdname = "snapshot"; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || (strcmp(cmdname, "--help") == 0)) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zfs_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = safe_malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. 
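 * As a special case (see the strchr(cmdname, '=') branch below), a first
 * argument containing '=' is dispatched to the 'set' command, so
 * 'zfs compression=on pool/fs' behaves like 'zfs set compression=on pool/fs'
 * (illustrative property and dataset).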
*/ libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=') != NULL) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } /* * zfs zone nsfile filesystem * * Add or delete the given dataset to/from the namespace. */ #ifdef __linux__ static int zfs_do_zone_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int ret; if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_userns(zhp, argv[1], attach) != 0); zfs_close(zhp); return (ret); } static int zfs_do_zone(int argc, char **argv) { return (zfs_do_zone_impl(argc, argv, B_TRUE)); } static int zfs_do_unzone(int argc, char **argv) { return (zfs_do_zone_impl(argc, argv, B_FALSE)); } #endif #ifdef __FreeBSD__ #include #include /* * Attach/detach the given dataset to/from the given jail */ static int zfs_do_jail_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int jailid, ret; /* check number of arguments */ if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } jailid = jail_getid(argv[1]); if (jailid < 0) { (void) fprintf(stderr, gettext("invalid jail id or name\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_jail(zhp, jailid, attach) != 0); zfs_close(zhp); return (ret); } /* * zfs jail jailid filesystem * * Attach the given dataset to the given jail */ static int zfs_do_jail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_TRUE)); } /* * zfs unjail jailid filesystem * * Detach the given dataset from the given jail */ static int zfs_do_unjail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_FALSE)); } #endif diff --git a/cmd/ztest.c b/cmd/ztest.c index b5a3d1810a5a..55020805d14f 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -1,8288 +1,8289 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ /* * The objective of this program is to provide a DMU/ZAP/SPA stress test * that runs entirely in userland, is easy to use, and easy to extend. * * The overall design of the ztest program is as follows: * * (1) For each major functional area (e.g. adding vdevs to a pool, * creating and destroying datasets, reading and writing objects, etc) * we have a simple routine to test that functionality. These * individual routines do not have to do anything "stressful". * * (2) We turn these simple functionality tests into a stress test by * running them all in parallel, with as many threads as desired, * and spread across as many datasets, objects, and vdevs as desired. * * (3) While all this is happening, we inject faults into the pool to * verify that self-healing data really works. * * (4) Every time we open a dataset, we change its checksum and compression * functions. Thus even individual objects vary from block to block * in which checksum they use and whether they're compressed. * * (5) To verify that we never lose on-disk consistency after a crash, * we run the entire test in a child of the main process. * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing * storage pool, as many times as desired. If backwards compatibility * testing is enabled ztest will sometimes run the "older" version * of ztest after a SIGKILL. * * (6) To verify that we don't have future leaks or temporal incursions, * many of the functional tests record the transaction group number * as part of their data. When reading old data, they verify that * the transaction group number is less than the current, open txg. * If you add a new test, please do this if applicable. * * (7) Threads are created with a reduced stack size, for sanity checking. * Therefore, it's important not to allocate huge buffers on the stack. * * When run with no arguments, ztest runs for about five minutes and * produces no output if successful. To get a little bit of information, * specify -V. To get more information, specify -VV, and so on. * * To turn this into an overnight stress test, use -T to specify run time. * * You can ask more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * * Use the -k option to set the desired frequency of kills. * * When ztest invokes itself it passes all relevant information through a * temporary file which is mmap-ed in the child process. This allows shared * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always * stored at offset 0 of this file and contains information on the size and * number of shared structures in the file. The information stored in this file * must remain backwards compatible with older versions of ztest so that * ztest can invoke them during backwards compatibility testing (-B). 
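 * Illustrative invocations (options are summarized above):
 *   ztest                                    default ~5 minute run, silent on success
 *   ztest -VV -T 3600                        one hour run with extra verbosity
 *   ztest -v 10 -d 20 -t 50 -k 90 -T 86400   heavier overnight stress run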
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if (__GLIBC__ && !__UCLIBC__) #include /* for backtrace() */ #endif static int ztest_fd_data = -1; static int ztest_fd_rand = -1; typedef struct ztest_shared_hdr { uint64_t zh_hdr_size; uint64_t zh_opts_size; uint64_t zh_size; uint64_t zh_stats_size; uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; enum ztest_class_state { ZTEST_VDEV_CLASS_OFF, ZTEST_VDEV_CLASS_ON, ZTEST_VDEV_CLASS_RND }; #define ZO_GVARS_MAX_ARGLEN ((size_t)64) #define ZO_GVARS_MAX_COUNT ((size_t)10) typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; char zo_alt_ztest[MAXNAMELEN]; char zo_alt_libpath[MAXNAMELEN]; uint64_t zo_vdevs; uint64_t zo_vdevtime; size_t zo_vdev_size; int zo_ashift; int zo_mirrors; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; int zo_draid_data; int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; uint64_t zo_killrate; int zo_verbose; int zo_init; uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; int zo_gvars_count; char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; } ztest_shared_opts_t; /* Default values for command line options. */ #define DEFAULT_POOL "ztest" #define DEFAULT_VDEV_DIR "/tmp" #define DEFAULT_VDEV_COUNT 5 #define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ #define DEFAULT_VDEV_SIZE_STR "256M" #define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT #define DEFAULT_MIRRORS 2 #define DEFAULT_RAID_CHILDREN 4 #define DEFAULT_RAID_PARITY 1 #define DEFAULT_DRAID_DATA 4 #define DEFAULT_DRAID_SPARES 1 #define DEFAULT_DATASETS_COUNT 7 #define DEFAULT_THREADS 23 #define DEFAULT_RUN_TIME 300 /* 300 seconds */ #define DEFAULT_RUN_TIME_STR "300 sec" #define DEFAULT_PASS_TIME 60 /* 60 seconds */ #define DEFAULT_PASS_TIME_STR "60 sec" #define DEFAULT_KILL_RATE 70 /* 70% kill rate */ #define DEFAULT_KILLRATE_STR "70%" #define DEFAULT_INITS 1 #define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ #define DEFAULT_FORCE_GANGING (64 << 10) #define DEFAULT_FORCE_GANGING_STR "64K" /* Simplifying assumption: -1 is not a valid default. 
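 * Every real default in option_table below is a non-negative count or size,
 * so -1 can safely serve as the "no default" sentinel that usage() tests for.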
*/ #define NO_DEFAULT -1 static const ztest_shared_opts_t ztest_opts_defaults = { .zo_pool = DEFAULT_POOL, .zo_dir = DEFAULT_VDEV_DIR, .zo_alt_ztest = { '\0' }, .zo_alt_libpath = { '\0' }, .zo_vdevs = DEFAULT_VDEV_COUNT, .zo_ashift = DEFAULT_ASHIFT, .zo_mirrors = DEFAULT_MIRRORS, .zo_raid_children = DEFAULT_RAID_CHILDREN, .zo_raid_parity = DEFAULT_RAID_PARITY, .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = DEFAULT_VDEV_SIZE, .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ .zo_datasets = DEFAULT_DATASETS_COUNT, .zo_threads = DEFAULT_THREADS, .zo_passtime = DEFAULT_PASS_TIME, .zo_killrate = DEFAULT_KILL_RATE, .zo_verbose = 0, .zo_mmp_test = 0, .zo_init = DEFAULT_INITS, .zo_time = DEFAULT_RUN_TIME, .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, }; extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_df_alloc_threshold; extern unsigned long zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern int zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; typedef struct ztest_shared_ds { uint64_t zd_seq; } ztest_shared_ds_t; static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, ZTEST_IO_WRITE_PATTERN, ZTEST_IO_WRITE_ZEROES, ZTEST_IO_TRUNCATE, ZTEST_IO_SETATTR, ZTEST_IO_REWRITE, ZTEST_IO_TYPES }; typedef struct ztest_block_tag { uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; uint64_t bt_dnodesize; uint64_t bt_offset; uint64_t bt_gen; uint64_t bt_txg; uint64_t bt_crtxg; } ztest_block_tag_t; typedef struct bufwad { uint64_t bw_index; uint64_t bw_txg; uint64_t bw_data; } bufwad_t; /* * It would be better to use a rangelock_t per object. Unfortunately * the rangelock_t is not a drop-in replacement for rl_t, because we * still need to map from object ID to rangelock_t. */ typedef enum { RL_READER, RL_WRITER, RL_APPEND } rl_type_t; typedef struct rll { void *rll_writer; int rll_readers; kmutex_t rll_lock; kcondvar_t rll_cv; } rll_t; typedef struct rl { uint64_t rl_object; uint64_t rl_offset; uint64_t rl_size; rll_t *rl_lock; } rl_t; #define ZTEST_RANGE_LOCKS 64 #define ZTEST_OBJECT_LOCKS 64 /* * Object descriptor. Used as a template for object lookup/create/remove. */ typedef struct ztest_od { uint64_t od_dir; uint64_t od_object; dmu_object_type_t od_type; dmu_object_type_t od_crtype; uint64_t od_blocksize; uint64_t od_crblocksize; uint64_t od_crdnodesize; uint64_t od_gen; uint64_t od_crgen; char od_name[ZFS_MAX_DATASET_NAME_LEN]; } ztest_od_t; /* * Per-dataset state. 
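 * One of these exists per dataset (-d); zd_shared points at the matching
 * ztest_shared_ds_t in the mmap-ed shared file, so the per-dataset sequence
 * number survives the SIGKILL/re-exec cycle described above.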
*/ typedef struct ztest_ds { ztest_shared_ds_t *zd_shared; objset_t *zd_os; pthread_rwlock_t zd_zilog_lock; zilog_t *zd_zilog; ztest_od_t *zd_od; /* debugging aid */ char zd_name[ZFS_MAX_DATASET_NAME_LEN]; kmutex_t zd_dirobj_lock; rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; } ztest_ds_t; /* * Per-iteration state. */ typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ const char *zi_funcname; /* name of test function */ } ztest_info_t; typedef struct ztest_shared_callstate { uint64_t zc_count; /* per-pass count */ uint64_t zc_time; /* per-pass time */ uint64_t zc_next; /* next time to call this function */ } ztest_shared_callstate_t; static ztest_shared_callstate_t *ztest_shared_callstate; #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; ztest_func_t ztest_dmu_object_next_chunk; ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; ztest_func_t ztest_fzap; ztest_func_t ztest_dmu_snapshot_create_destroy; ztest_func_t ztest_dsl_prop_get_set; ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_device_removal; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; ztest_func_t ztest_trim; ztest_func_t ztest_blake3; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ #define ZTI_INIT(func, iters, interval) \ { .zi_func = (func), \ .zi_iters = (iters), \ .zi_interval = (interval), \ .zi_funcname = # func } ztest_info_t ztest_info[] = { ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), ZTI_INIT(ztest_zap, 30, &zopt_always), ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), ZTI_INIT(ztest_split_pool, 1, &zopt_always), ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), #if 0 
ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), #endif ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), ZTI_INIT(ztest_reguid, 1, &zopt_rarely), ZTI_INIT(ztest_scrub, 1, &zopt_rarely), ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), ZTI_INIT(ztest_trim, 1, &zopt_sometimes), ZTI_INIT(ztest_blake3, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) /* * The following struct is used to hold a list of uncalled commit callbacks. * The callbacks are ordered by txg number. */ typedef struct ztest_cb_list { kmutex_t zcl_callbacks_lock; list_t zcl_callbacks; } ztest_cb_list_t; /* * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { boolean_t zs_do_init; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; hrtime_t zs_thread_stop; hrtime_t zs_thread_kill; uint64_t zs_enospc_count; uint64_t zs_vdev_next_leaf; uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; uint64_t zs_splits; uint64_t zs_mirrors; uint64_t zs_metaslab_sz; uint64_t zs_metaslab_df_alloc_threshold; uint64_t zs_guid; } ztest_shared_t; #define ID_PARALLEL -1ULL static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; ztest_shared_t *ztest_shared; static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; static boolean_t ztest_device_removal_active = B_FALSE; static boolean_t ztest_pool_scrubbed = B_FALSE; static kmutex_t ztest_checkpoint_lock; /* * The ztest_name_lock protects the pool and dataset namespace used by * the individual tests. To modify the namespace, consumers must grab * this lock as writer. Grabbing the lock as reader will ensure that the * namespace does not change while the lock is held. */ static pthread_rwlock_t ztest_name_lock; static boolean_t ztest_dump_core = B_TRUE; static boolean_t ztest_exiting; /* Global commit callback list */ static ztest_cb_list_t zcl; /* Commit cb delay */ static uint64_t zc_min_txg_delay = UINT64_MAX; static int zc_cb_counter = 0; /* * Minimum number of commit callbacks that need to be registered for us to check * whether the minimum txg delay is acceptable. */ #define ZTEST_COMMIT_CB_MIN_REG 100 /* * If a number of txgs equal to this threshold have been created after a commit * callback has been registered but not called, then we assume there is an * implementation bug. 
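 * (With TXG_CONCURRENT_STATES == 3 this threshold works out to 1003 txgs,
 * far beyond the handful of in-flight txgs a callback can legitimately
 * wait behind.)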
*/ #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, ZTEST_OBJECTS }; static __attribute__((noreturn)) void usage(boolean_t requested); static int ztest_scrub_impl(spa_t *spa); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void dump_debug_buffer(void) { ssize_t ret __attribute__((unused)); if (!ztest_opts.zo_dump_dbgmsg) return; /* * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ ret = write(STDOUT_FILENO, "\n", 1); zfs_dbgmsg_print("ztest"); } #define BACKTRACE_SZ 100 static void sig_handler(int signo) { struct sigaction action; #if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ int nptrs; void *buffer[BACKTRACE_SZ]; nptrs = backtrace(buffer, BACKTRACE_SZ); backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); #endif dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); raise(signo); } #define FATAL_MSG_SZ 1024 static const char *fatal_msg; static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void fatal(int do_perror, const char *message, ...) { va_list args; int save_errno = errno; char *buf; (void) fflush(stdout); buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); if (buf == NULL) goto out; va_start(args, message); (void) sprintf(buf, "ztest: "); /* LINTED */ (void) vsprintf(buf + strlen(buf), message, args); va_end(args); if (do_perror) { (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), ": %s", strerror(save_errno)); } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease debugging */ out: if (ztest_dump_core) abort(); else dump_debug_buffer(); exit(3); } static int str2shift(const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); } if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { return (10*i); } (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); } static uint64_t nicenumtoull(const char *buf) { char *end; uint64_t val; val = strtoull(buf, &end, 0); if (end == buf) { (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); usage(B_FALSE); } else if (end[0] == '.') { double fval = strtod(buf, &end); fval *= pow(2, str2shift(end)); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. 
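 * For example (illustrative), an argument of "16.0E" evaluates to exactly
 * 2^64 == UINT64_MAX + 1 here; the >= test rejects it, whereas a plain >
 * against (double)UINT64_MAX (which also rounds to 2^64) would let it
 * slip through to the cast below and overflow.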
*/ if (fval >= (double)UINT64_MAX) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val = (uint64_t)fval; } else { int shift = str2shift(end); if (shift >= 64 || (val << shift) >> shift != val) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val <<= shift; } return (val); } typedef struct ztest_option { const char short_opt; const char *long_opt; const char *long_opt_param; const char *comment; unsigned int default_int; const char *default_str; } ztest_option_t; /* * The following option_table is used for generating the usage info as well as * the long and short option information for calling getopt_long(). */ static ztest_option_t option_table[] = { { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, NULL}, { 's', "vdev-size", "INTEGER", "Size of each vdev", NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, { 'a', "alignment-shift", "INTEGER", "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", DEFAULT_MIRRORS, NULL}, { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, { 'K', "raid-kind", "raidz|draid|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, { 'S', "draid-spares", "INTEGER", "Number of draid spares", DEFAULT_DRAID_SPARES, NULL}, { 'd', "datasets", "INTEGER", "Number of datasets", DEFAULT_DATASETS_COUNT, NULL}, { 't', "threads", "INTEGER", "Number of ztest threads", DEFAULT_THREADS, NULL}, { 'g', "gang-block-threshold", "INTEGER", "Metaslab gang block threshold", NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, { 'i', "init-count", "INTEGER", "Number of times to initialize pool", DEFAULT_INITS, NULL}, { 'k', "kill-percentage", "INTEGER", "Kill percentage", NO_DEFAULT, DEFAULT_KILLRATE_STR}, { 'p', "pool-name", "STRING", "Pool name", NO_DEFAULT, DEFAULT_POOL}, { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", NO_DEFAULT, DEFAULT_VDEV_DIR}, { 'M', "multi-host", NULL, "Multi-host; simulate pool imported on remote host", NO_DEFAULT, NULL}, { 'E', "use-existing-pool", NULL, "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, { 'T', "run-time", "INTEGER", "Total run time", NO_DEFAULT, DEFAULT_RUN_TIME_STR}, { 'P', "pass-time", "INTEGER", "Time per pass", NO_DEFAULT, DEFAULT_PASS_TIME_STR}, { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", DEFAULT_MAX_LOOPS, NULL}, { 'B', "alt-ztest", "PATH", "Alternate ztest path", NO_DEFAULT, NULL}, { 'C', "vdev-class-state", "on|off|random", "vdev class state", NO_DEFAULT, "random"}, { 'o', "option", "\"OPTION=INTEGER\"", "Set global variable to an unsigned 32-bit integer value", NO_DEFAULT, NULL}, { 'G', "dump-debug-msg", NULL, "Dump zfs_dbgmsg buffer before exiting due to an error", NO_DEFAULT, NULL}, { 'V', "verbose", NULL, "Verbose (use multiple times for ever more verbosity)", NO_DEFAULT, NULL}, { 'h', "help", NULL, "Show this help", NO_DEFAULT, NULL}, {0, 0, 0, 0, 0, 0} }; static struct option *long_opts = NULL; static char *short_opts = NULL; static void init_options(void) { ASSERT3P(long_opts, ==, NULL); ASSERT3P(short_opts, ==, NULL); int count = sizeof (option_table) / sizeof (option_table[0]); long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); int short_opt_index = 0; for (int i = 0; i < 
count; i++) { long_opts[i].val = option_table[i].short_opt; long_opts[i].name = option_table[i].long_opt; long_opts[i].has_arg = option_table[i].long_opt_param != NULL ? required_argument : no_argument; long_opts[i].flag = NULL; short_opts[short_opt_index++] = option_table[i].short_opt; if (option_table[i].long_opt_param != NULL) { short_opts[short_opt_index++] = ':'; } } } static void fini_options(void) { int count = sizeof (option_table) / sizeof (option_table[0]); umem_free(long_opts, sizeof (struct option) * count); umem_free(short_opts, sizeof (char) * 2 * count); long_opts = NULL; short_opts = NULL; } static __attribute__((noreturn)) void usage(boolean_t requested) { char option[80]; FILE *fp = requested ? stdout : stderr; (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); for (int i = 0; option_table[i].short_opt != 0; i++) { if (option_table[i].long_opt_param != NULL) { (void) sprintf(option, " -%c --%s=%s", option_table[i].short_opt, option_table[i].long_opt, option_table[i].long_opt_param); } else { (void) sprintf(option, " -%c --%s", option_table[i].short_opt, option_table[i].long_opt); } (void) fprintf(fp, " %-40s%s", option, option_table[i].comment); if (option_table[i].long_opt_param != NULL) { if (option_table[i].default_str != NULL) { (void) fprintf(fp, " (default: %s)", option_table[i].default_str); } else if (option_table[i].default_int != NO_DEFAULT) { (void) fprintf(fp, " (default: %u)", option_table[i].default_int); } } (void) fprintf(fp, "\n"); } exit(requested ? 0 : 1); } static uint64_t ztest_random(uint64_t range) { uint64_t r; ASSERT3S(ztest_fd_rand, >=, 0); if (range == 0) return (0); if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(B_TRUE, "short read from /dev/urandom"); return (r % range); } static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) { char name[32]; char *value; int state = ZTEST_VDEV_CLASS_RND; (void) strlcpy(name, input, sizeof (name)); value = strchr(name, '='); if (value == NULL) { (void) fprintf(stderr, "missing value in property=value " "'-C' argument (%s)\n", input); usage(B_FALSE); } *(value) = '\0'; value++; if (strcmp(value, "on") == 0) { state = ZTEST_VDEV_CLASS_ON; } else if (strcmp(value, "off") == 0) { state = ZTEST_VDEV_CLASS_OFF; } else if (strcmp(value, "random") == 0) { state = ZTEST_VDEV_CLASS_RND; } else { (void) fprintf(stderr, "invalid property value '%s'\n", value); usage(B_FALSE); } if (strcmp(name, "special") == 0) { zo->zo_special_vdevs = state; } else { (void) fprintf(stderr, "invalid property name '%s'\n", name); usage(B_FALSE); } if (zo->zo_verbose >= 3) (void) printf("%s vdev state is '%s'\n", name, value); } static void process_options(int argc, char **argv) { char *path; ztest_shared_opts_t *zo = &ztest_opts; int opt; uint64_t value; const char *raid_kind = "random"; memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); init_options(); while ((opt = getopt_long(argc, argv, short_opts, long_opts, NULL)) != EOF) { value = 0; switch (opt) { case 'v': case 's': case 'a': case 'm': case 'r': case 'R': case 'D': case 'S': case 'd': case 't': case 'g': case 'i': case 'k': case 'T': case 'P': case 'F': value = nicenumtoull(optarg); } switch (opt) { case 'v': zo->zo_vdevs = value; break; case 's': zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); break; case 'a': zo->zo_ashift = value; break; case 'm': zo->zo_mirrors = value; break; case 'r': zo->zo_raid_children = MAX(1, value); break; case 'R': zo->zo_raid_parity = MIN(MAX(value, 1), 3); break; case 'K': raid_kind = optarg; break; case 
'D': zo->zo_draid_data = MAX(1, value); break; case 'S': zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); break; case 't': zo->zo_threads = MAX(1, value); break; case 'g': zo->zo_metaslab_force_ganging = MAX(SPA_MINBLOCKSIZE << 1, value); break; case 'i': zo->zo_init = value; break; case 'k': zo->zo_killrate = value; break; case 'p': (void) strlcpy(zo->zo_pool, optarg, sizeof (zo->zo_pool)); break; case 'f': path = realpath(optarg, NULL); if (path == NULL) { (void) fprintf(stderr, "error: %s: %s\n", optarg, strerror(errno)); usage(B_FALSE); } else { (void) strlcpy(zo->zo_dir, path, sizeof (zo->zo_dir)); free(path); } break; case 'M': zo->zo_mmp_test = 1; break; case 'V': zo->zo_verbose++; break; case 'E': zo->zo_init = 0; break; case 'T': zo->zo_time = value; break; case 'P': zo->zo_passtime = MAX(1, value); break; case 'F': zo->zo_maxloops = MAX(1, value); break; case 'B': (void) strlcpy(zo->zo_alt_ztest, optarg, sizeof (zo->zo_alt_ztest)); break; case 'C': ztest_parse_name_value(optarg, zo); break; case 'o': if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { (void) fprintf(stderr, "max global var count (%zu) exceeded\n", ZO_GVARS_MAX_COUNT); usage(B_FALSE); } char *v = zo->zo_gvars[zo->zo_gvars_count]; if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= ZO_GVARS_MAX_ARGLEN) { (void) fprintf(stderr, "global var option '%s' is too long\n", optarg); usage(B_FALSE); } zo->zo_gvars_count++; break; case 'G': zo->zo_dump_dbgmsg = 1; break; case 'h': usage(B_TRUE); break; case '?': default: usage(B_FALSE); break; } } fini_options(); /* When raid choice is 'random' add a draid pool 50% of the time */ if (strcmp(raid_kind, "random") == 0) { raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz"; if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); } if (strcmp(raid_kind, "draid") == 0) { uint64_t min_devsize; /* With fewer disk use 256M, otherwise 128M is OK */ min_devsize = (ztest_opts.zo_raid_children < 16) ? (256ULL << 20) : (128ULL << 20); /* No top-level mirrors with dRAID for now */ zo->zo_mirrors = 0; /* Use more appropriate defaults for dRAID */ if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) zo->zo_vdevs = 1; if (zo->zo_raid_children == ztest_opts_defaults.zo_raid_children) zo->zo_raid_children = 16; if (zo->zo_ashift < 12) zo->zo_ashift = 12; if (zo->zo_vdev_size < min_devsize) zo->zo_vdev_size = min_devsize; if (zo->zo_draid_data + zo->zo_raid_parity > zo->zo_raid_children - zo->zo_draid_spares) { (void) fprintf(stderr, "error: too few draid " "children (%d) for stripe width (%d)\n", zo->zo_raid_children, zo->zo_draid_data + zo->zo_raid_parity); usage(B_FALSE); } (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? 
zo->zo_time * NANOSEC / zo->zo_vdevs : UINT64_MAX >> 2); if (*zo->zo_alt_ztest) { const char *invalid_what = "ztest"; char *val = zo->zo_alt_ztest; if (0 != access(val, X_OK) || (strrchr(val, '/') == NULL && (errno = EINVAL))) goto invalid; int dirlen = strrchr(val, '/') - val; - strncpy(zo->zo_alt_libpath, val, dirlen); + strlcpy(zo->zo_alt_libpath, val, + MIN(sizeof (zo->zo_alt_libpath), dirlen + 1)); invalid_what = "library path", val = zo->zo_alt_libpath; if (strrchr(val, '/') == NULL && (errno = EINVAL)) goto invalid; *strrchr(val, '/') = '\0'; strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); if (0 != access(zo->zo_alt_libpath, X_OK)) goto invalid; return; invalid: ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); } } static void ztest_kill(ztest_shared_t *zs) { zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); /* * Before we kill ourselves, make sure that the config is updated. * See comment above spa_write_cachefile(). */ mutex_enter(&spa_namespace_lock); spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); (void) raise(SIGKILL); } static void ztest_record_enospc(const char *s) { (void) s; ztest_shared->zs_enospc_count++; } static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } static boolean_t ztest_is_draid_spare(const char *name) { uint64_t spare_id = 0, parity = 0, vdev_id = 0; if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", &parity, &vdev_id, &spare_id) == 3) { return (B_TRUE); } return (B_FALSE); } static nvlist_t * make_vdev_file(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift) { char *pathbuf = NULL; uint64_t vdev; nvlist_t *file; boolean_t draid_spare = B_FALSE; if (ashift == 0) ashift = ztest_get_ashift(); if (path == NULL) { pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); path = pathbuf; if (aux != NULL) { vdev = ztest_shared->zs_vdev_aux; (void) snprintf(pathbuf, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; (void) snprintf(pathbuf, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } } else { draid_spare = ztest_is_draid_spare(path); } if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(B_TRUE, "can't open %s", path); if (ftruncate(fd, size) != 0) fatal(B_TRUE, "can't ftruncate %s", path); (void) close(fd); } file = fnvlist_alloc(); fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift, int r) { nvlist_t *raid, **child; int c; if (r < 2) return (make_vdev_file(path, aux, pool, size, ashift)); child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); raid = fnvlist_alloc(); fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, ztest_opts.zo_raid_type); fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, ztest_opts.zo_raid_parity); fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, r); if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { uint64_t ndata = ztest_opts.zo_draid_data; uint64_t nparity = ztest_opts.zo_raid_parity; uint64_t nspares = ztest_opts.zo_draid_spares; uint64_t children = ztest_opts.zo_raid_children; uint64_t ngroups = 1; /* * Calculate the minimum number of groups required to fill a * slice. This is the LCM of the stripe width (data + parity) * and the number of data drives (children - spares). */ while (ngroups * (ndata + nparity) % (children - nspares) != 0) ngroups++; /* Store the basic dRAID configuration. */ fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); } for (c = 0; c < r; c++) fnvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); return (raid); } static nvlist_t * make_vdev_mirror(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift, int r, int m) { nvlist_t *mirror, **child; int c; if (m < 1) return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); mirror = fnvlist_alloc(); fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, m); for (c = 0; c < m; c++) fnvlist_free(child[c]); umem_free(child, m * sizeof (nvlist_t *)); return (mirror); } static nvlist_t * make_vdev_root(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift, const char *class, int r, int m, int t) { nvlist_t *root, **child; int c; boolean_t log; ASSERT3S(t, >, 0); log = (class != NULL && strcmp(class, "log") == 0); child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { child[c] = make_vdev_mirror(path, aux, pool, size, ashift, r, m); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); if (class != NULL && class[0] != '\0') { ASSERT(m > 1 || log); /* expecting a mirror */ fnvlist_add_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, class); } } root = fnvlist_alloc(); fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, t); for (c = 0; c < t; c++) fnvlist_free(child[c]); umem_free(child, t * sizeof (nvlist_t *)); return (root); } /* * Find a random spa version. Returns back a random spa version in the * range [initial_version, SPA_VERSION_FEATURES]. 
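 * Versions at or below SPA_VERSION_BEFORE_FEATURES are chosen uniformly from [initial_version, SPA_VERSION_BEFORE_FEATURES]; anything newer collapses to SPA_VERSION_FEATURES, since no supported version lies between the two.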
*/ static uint64_t ztest_random_spa_version(uint64_t initial_version) { uint64_t version = initial_version; if (version <= SPA_VERSION_BEFORE_FEATURES) { version = version + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); } if (version > SPA_VERSION_BEFORE_FEATURES) version = SPA_VERSION_FEATURES; ASSERT(SPA_VERSION_IS_SUPPORTED(version)); return (version); } static int ztest_random_blocksize(void) { ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); /* * Choose a block size >= the ashift. * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. */ int maxbs = SPA_OLD_MAXBLOCKSHIFT; if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) maxbs = 20; uint64_t block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int ztest_random_dnodesize(void) { int slots; int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; if (max_slots == DNODE_MIN_SLOTS) return (DNODE_MIN_SIZE); /* * Weight the random distribution more heavily toward smaller * dnode sizes since that is more likely to reflect real-world * usage. */ ASSERT3U(max_slots, >, 4); switch (ztest_random(10)) { case 0: slots = 5 + ztest_random(max_slots - 4); break; case 1 ... 4: slots = 2 + ztest_random(3); break; default: slots = 1; break; } return (slots << DNODE_SHIFT); } static int ztest_random_ibshift(void) { return (DN_MIN_INDBLKSHIFT + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); } static uint64_t ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) { uint64_t top; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); do { top = ztest_random(rvd->vdev_children); tvd = rvd->vdev_child[top]; } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); return (top); } static uint64_t ztest_random_dsl_prop(zfs_prop_t prop) { uint64_t value; do { value = zfs_prop_random_value(prop, ztest_random(-1ULL)); } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); return (value); } static int ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, boolean_t inherit) { const char *propname = zfs_prop_to_name(prop); const char *valname; char *setpoint; uint64_t curval; int error; error = dsl_prop_set_int(osname, propname, (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); if (ztest_opts.zo_verbose >= 6) { int err; err = zfs_prop_index_to_string(prop, curval, &valname); if (err) (void) printf("%s %s = %llu at '%s'\n", osname, propname, (unsigned long long)curval, setpoint); else (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); } umem_free(setpoint, MAXPATHLEN); return (error); } static int ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) { spa_t *spa = ztest_spa; nvlist_t *props = NULL; int error; props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); error = spa_prop_set(spa, props); fnvlist_free(props); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); return (error); } static int ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { int err; char *cp = NULL; char ddname[ZFS_MAX_DATASET_NAME_LEN]; strcpy(ddname, name); cp = strchr(ddname, '@'); if (cp != NULL) *cp = '\0'; err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); while (decrypt && err == EACCES) { dsl_crypto_params_t *dcp; nvlist_t *crypto_args = fnvlist_alloc(); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); /* * Note: if there was an error loading, the wkey was not * consumed, and needs to be freed. */ dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err == EINVAL) { /* * We couldn't load a key for this dataset so try * the parent. This loop will eventually hit the * encryption root since ztest only makes clones * as children of their origin datasets. 
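 * Each EINVAL retry strips the last '/child' component from ddname and reattempts the wrapping-key load one level closer to the encryption root.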
*/ cp = strrchr(ddname, '/'); if (cp == NULL) return (err); *cp = '\0'; err = EACCES; continue; } else if (err != 0) { break; } err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); break; } return (err); } static void ztest_rll_init(rll_t *rll) { rll->rll_writer = NULL; rll->rll_readers = 0; mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); } static void ztest_rll_destroy(rll_t *rll) { ASSERT3P(rll->rll_writer, ==, NULL); ASSERT0(rll->rll_readers); mutex_destroy(&rll->rll_lock); cv_destroy(&rll->rll_cv); } static void ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); if (type == RL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; } else { while (rll->rll_writer != NULL || rll->rll_readers) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_writer = curthread; } mutex_exit(&rll->rll_lock); } static void ztest_rll_unlock(rll_t *rll) { mutex_enter(&rll->rll_lock); if (rll->rll_writer) { ASSERT0(rll->rll_readers); rll->rll_writer = NULL; } else { ASSERT3S(rll->rll_readers, >, 0); ASSERT3P(rll->rll_writer, ==, NULL); rll->rll_readers--; } if (rll->rll_writer == NULL && rll->rll_readers == 0) cv_broadcast(&rll->rll_cv); mutex_exit(&rll->rll_lock); } static void ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_lock(rll, type); } static void ztest_object_unlock(ztest_ds_t *zd, uint64_t object) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_unlock(rll); } static rl_t * ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, rl_type_t type) { uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; rl_t *rl; rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); rl->rl_object = object; rl->rl_offset = offset; rl->rl_size = size; rl->rl_lock = rll; ztest_rll_lock(rll, type); return (rl); } static void ztest_range_unlock(rl_t *rl) { rll_t *rll = rl->rl_lock; ztest_rll_unlock(rll); umem_free(rl, sizeof (*rl)); } static void ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) { zd->zd_os = os; zd->zd_zilog = dmu_objset_zil(os); zd->zd_shared = szd; dmu_objset_name(os, zd->zd_name); int l; if (zd->zd_shared != NULL) zd->zd_shared->zd_seq = 0; VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_init(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_init(&zd->zd_range_lock[l]); } static void ztest_zd_fini(ztest_ds_t *zd) { int l; mutex_destroy(&zd->zd_dirobj_lock); (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_destroy(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_destroy(&zd->zd_range_lock[l]); } #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) static uint64_t ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) { uint64_t txg; int error; /* * Attempt to assign tx to some transaction group. 
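 * Returns the assigned txg on success. On failure the tx is aborted and 0 is returned: ERESTART (possible only with TXG_NOWAIT) waits for the next txg before giving up, while ENOSPC is tallied via ztest_record_enospc().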
*/ error = dmu_tx_assign(tx, txg_how); if (error) { if (error == ERESTART) { ASSERT3U(txg_how, ==, TXG_NOWAIT); dmu_tx_wait(tx); } else { ASSERT3U(error, ==, ENOSPC); ztest_record_enospc(tag); } dmu_tx_abort(tx); return (0); } txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); return (txg); } static void ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { bt->bt_magic = BT_MAGIC; bt->bt_objset = dmu_objset_id(os); bt->bt_object = object; bt->bt_dnodesize = dnodesize; bt->bt_offset = offset; bt->bt_gen = gen; bt->bt_txg = txg; bt->bt_crtxg = crtxg; } static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { ASSERT3U(bt->bt_magic, ==, BT_MAGIC); ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); ASSERT3U(bt->bt_object, ==, object); ASSERT3U(bt->bt_dnodesize, ==, dnodesize); ASSERT3U(bt->bt_offset, ==, offset); ASSERT3U(bt->bt_gen, <=, gen); ASSERT3U(bt->bt_txg, <=, txg); ASSERT3U(bt->bt_crtxg, ==, crtxg); } static ztest_block_tag_t * ztest_bt_bonus(dmu_buf_t *db) { dmu_object_info_t doi; ztest_block_tag_t *bt; dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); return (bt); } /* * Generate a token to fill up unused bonus buffer space. Try to make * it unique to the object, generation, and offset to verify that data * is not getting overwritten by data from other dnodes. */ #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) /* * Fill up the unused bonus buffer region before the block tag with a * verifiable pattern. Filling the whole bonus area with non-zero data * helps ensure that all dnode traversal code properly skips the * interior regions of large dnodes. */ static void ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); *bonusp = token; } } /* * Verify that the unused area of a bonus buffer is filled with the * expected tokens. 
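 * Each 64-bit word is compared against the ZTEST_BONUS_FILL_TOKEN() value recomputed from the object, objset id, generation, and word offset.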
*/ static void ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); VERIFY3U(*bonusp, ==, token); } } /* * ZIL logging ops */ #define lrz_type lr_mode #define lrz_blocksize lr_uid #define lrz_ibshift lr_gid #define lrz_bonustype lr_rdev #define lrz_dnodesize lr_crtime[1] static void ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) + namesize - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) + namesize - sizeof (lr_t)); itx->itx_oid = object; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) { itx_t *itx; itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); if (zil_replaying(zd->zd_zilog, tx)) return; if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); if (write_state == WR_COPIED && dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); write_state = WR_NEED_COPY; } itx->itx_private = zd; itx->itx_wr_state = write_state; itx->itx_sync = (ztest_random(8) == 0); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } /* * ZIL replay ops */ static int ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_create_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; ztest_block_tag_t *bbt; dmu_buf_t *db; dmu_tx_t *tx; uint64_t txg; int error = 0; int bonuslen; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } else { dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); } txg = ztest_tx_assign(tx, TXG_WAIT, 
FTAG); if (txg == 0) return (ENOSPC); ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { if (lr->lr_foid == 0) { lr->lr_foid = zap_create_dnsize(os, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = zap_create_claim_dnsize(os, lr->lr_foid, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } else { if (lr->lr_foid == 0) { lr->lr_foid = dmu_object_alloc_dnsize(os, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = dmu_object_claim_dnsize(os, lr->lr_foid, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } if (error) { ASSERT3U(error, ==, EEXIST); ASSERT(zd->zd_zilog->zl_replay); dmu_tx_commit(tx); return (error); } ASSERT3U(lr->lr_foid, !=, 0); if (lr->lrz_type != DMU_OT_ZAP_OTHER) VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, lr->lrz_blocksize, lr->lrz_ibshift, tx)); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); bbt = ztest_bt_bonus(db); dmu_buf_will_dirty(db, tx); ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, lr->lr_gen, txg, txg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); dmu_buf_rele(db, FTAG); VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, &lr->lr_foid, tx)); (void) ztest_log_create(zd, tx, lr); dmu_tx_commit(tx); return (0); } static int ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_remove_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; dmu_object_info_t doi; dmu_tx_t *tx; uint64_t object, txg; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); VERIFY0( zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); ztest_object_lock(zd, object, RL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_object_unlock(zd, object); return (ENOSPC); } if (doi.doi_type == DMU_OT_ZAP_OTHER) { VERIFY0(zap_destroy(os, object, tx)); } else { VERIFY0(dmu_object_free(os, object, tx)); } VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); (void) ztest_log_remove(zd, tx, lr, object); dmu_tx_commit(tx); ztest_object_unlock(zd, object); return (0); } static int ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_write_t *lr = arg2; objset_t *os = zd->zd_os; void *data = lr + 1; /* data follows lr */ uint64_t offset, length; ztest_block_tag_t *bt = data; ztest_block_tag_t *bbt; uint64_t gen, txg, lrtxg, crtxg; dmu_object_info_t doi; dmu_tx_t *tx; dmu_buf_t *db; arc_buf_t *abuf = NULL; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } if (bt->bt_magic == BSWAP_64(BT_MAGIC)) byteswap_uint64_array(bt, sizeof (*bt)); if (bt->bt_magic != BT_MAGIC) bt = NULL; ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 
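/* The bonus buffer's block tag supplies the gen and crtxg used below to validate the data about to be overwritten. */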
dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); gen = bbt->bt_gen; crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; tx = dmu_tx_create(os); dmu_tx_hold_write(tx, lr->lr_foid, offset, length); if (ztest_random(8) == 0 && length == doi.doi_data_block_size && P2PHASE(offset, length) == 0) abuf = dmu_request_arcbuf(db, length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { if (abuf != NULL) dmu_return_arcbuf(abuf); dmu_buf_rele(db, FTAG); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } if (bt != NULL) { /* * Usually, verify the old data before writing new data -- * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. */ ASSERT0(offset % doi.doi_data_block_size); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; ztest_block_tag_t rbt; VERIFY(dmu_read(os, lr->lr_foid, offset, sizeof (rbt), &rbt, prefetch) == 0); if (rbt.bt_magic == BT_MAGIC) { ztest_bt_verify(&rbt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } } /* * Writes can appear to be newer than the bonus buffer because * the ztest_get_data() callback does a dmu_read() of the * open-context data, which may be different than the data * as it was when the write was generated. */ if (zd->zd_zilog->zl_replay) { ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, MAX(gen, bt->bt_gen), MAX(txg, lrtxg), bt->bt_crtxg); } /* * Set the bt's gen/txg to the bonus buffer's gen/txg * so that all of the usual ASSERTs will work. */ ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { memcpy(abuf->b_data, data, length); VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); } (void) ztest_log_write(zd, tx, lr); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_truncate_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, lr->lr_length, tx)); (void) ztest_log_truncate(zd, tx, lr); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_setattr_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; dmu_buf_t *db; ztest_block_tag_t *bbt; uint64_t txg, lrtxg, crtxg, dnodesize; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, lr->lr_foid); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; dnodesize = 
bbt->bt_dnodesize; if (zd->zd_zilog->zl_replay) { ASSERT3U(lr->lr_size, !=, 0); ASSERT3U(lr->lr_mode, !=, 0); ASSERT3U(lrtxg, !=, 0); } else { /* * Randomly change the size and increment the generation. */ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * sizeof (*bbt); lr->lr_mode = bbt->bt_gen + 1; ASSERT0(lrtxg); } /* * Verify that the current bonus buffer is not newer than our txg. */ ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, MAX(txg, lrtxg), crtxg); dmu_buf_will_dirty(db, tx); ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); ASSERT3U(lr->lr_size, <=, db->db_size); VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); bbt = ztest_bt_bonus(db); ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, txg, crtxg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); dmu_buf_rele(db, FTAG); (void) ztest_log_setattr(zd, tx, lr); dmu_tx_commit(tx); ztest_object_unlock(zd, lr->lr_foid); return (0); } zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { NULL, /* 0 no such transaction type */ ztest_replay_create, /* TX_CREATE */ NULL, /* TX_MKDIR */ NULL, /* TX_MKXATTR */ NULL, /* TX_SYMLINK */ ztest_replay_remove, /* TX_REMOVE */ NULL, /* TX_RMDIR */ NULL, /* TX_LINK */ NULL, /* TX_RENAME */ ztest_replay_write, /* TX_WRITE */ ztest_replay_truncate, /* TX_TRUNCATE */ ztest_replay_setattr, /* TX_SETATTR */ NULL, /* TX_ACL */ NULL, /* TX_CREATE_ACL */ NULL, /* TX_CREATE_ATTR */ NULL, /* TX_CREATE_ACL_ATTR */ NULL, /* TX_MKDIR_ACL */ NULL, /* TX_MKDIR_ATTR */ NULL, /* TX_MKDIR_ACL_ATTR */ NULL, /* TX_WRITE2 */ NULL, /* TX_SETSAXATTR */ }; /* * ZIL get_data callbacks */ static void ztest_get_done(zgd_t *zgd, int error) { (void) error; ztest_ds_t *zd = zgd->zgd_private; uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); ztest_range_unlock((rl_t *)zgd->zgd_lr); ztest_object_unlock(zd, object); umem_free(zgd, sizeof (*zgd)); } static int ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { (void) arg2; ztest_ds_t *zd = arg; objset_t *os = zd->zd_os; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; uint64_t txg = lr->lr_common.lrc_txg; uint64_t crtxg; dmu_object_info_t doi; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); return (error); } crtxg = ztest_bt_bonus(db)->bt_crtxg; if (crtxg == 0 || crtxg > txg) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, object); return (ENOENT); } dmu_object_info_from_db(db, &doi); dmu_buf_rele(db, FTAG); db = NULL; zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); zgd->zgd_lwb = lwb; zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); } else { ASSERT3U(offset, <, size); offset = 0; } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT3U(db->db_offset, ==, offset); ASSERT3U(db->db_size, ==, size); 
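/* Hand the dbuf to dmu_sync(); ztest_get_done() releases the dbuf hold and the range and object locks whether the sync completes asynchronously or fails immediately. */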
error = dmu_sync(zio, lr->lr_common.lrc_txg, ztest_get_done, zgd); if (error == 0) return (0); } } ztest_get_done(zgd, error); return (error); } static void * ztest_lr_alloc(size_t lrsize, char *name) { char *lr; size_t namesize = name ? strlen(name) + 1 : 0; lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); if (name) memcpy(lr + lrsize, name, namesize); return (lr); } static void ztest_lr_free(void *lr, size_t lrsize, char *name) { size_t namesize = name ? strlen(name) + 1 : 0; umem_free(lr, lrsize + namesize); } /* * Lookup a bunch of objects. Returns the number of objects not found. */ static int ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { od->od_object = 0; error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, sizeof (uint64_t), 1, &od->od_object); if (error) { ASSERT3S(error, ==, ENOENT); ASSERT0(od->od_object); missing++; } else { dmu_buf_t *db; ztest_block_tag_t *bbt; dmu_object_info_t doi; ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ ztest_object_lock(zd, od->od_object, RL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); od->od_type = doi.doi_type; od->od_blocksize = doi.doi_data_block_size; od->od_gen = bbt->bt_gen; dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, od->od_object); } } return (missing); } static int ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { if (missing) { od->od_object = 0; missing++; continue; } lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ lr->lrz_type = od->od_crtype; lr->lrz_blocksize = od->od_crblocksize; lr->lrz_ibshift = ztest_random_ibshift(); lr->lrz_bonustype = DMU_OT_UINT64_OTHER; lr->lrz_dnodesize = od->od_crdnodesize; lr->lr_gen = od->od_crgen; lr->lr_crtime[0] = time(NULL); if (ztest_replay_create(zd, lr, B_FALSE) != 0) { ASSERT0(missing); od->od_object = 0; missing++; } else { od->od_object = lr->lr_foid; od->od_type = od->od_crtype; od->od_blocksize = od->od_crblocksize; od->od_gen = od->od_crgen; ASSERT3U(od->od_object, !=, 0); } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); od += count - 1; for (i = count - 1; i >= 0; i--, od--) { if (missing) { missing++; continue; } /* * No object was found. 
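 * There is nothing to remove for this entry, so skip it and check the next one.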
*/ if (od->od_object == 0) continue; lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { ASSERT3U(error, ==, ENOSPC); missing++; } else { od->od_object = 0; } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, void *data) { lr_write_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); memcpy(lr + 1, data, size); error = ztest_replay_write(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr) + size, NULL); return (error); } static int ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { lr_truncate_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; error = ztest_replay_truncate(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static int ztest_setattr(ztest_ds_t *zd, uint64_t object) { lr_setattr_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_size = 0; lr->lr_mode = 0; error = ztest_replay_setattr(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static void ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; txg_wait_synced(dmu_objset_pool(os), 0); ztest_object_lock(zd, object, RL_READER); rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, object, offset, size); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg != 0) { dmu_prealloc(os, object, offset, size, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); } else { (void) dmu_free_long_range(os, object, offset, size); } ztest_range_unlock(rl); ztest_object_unlock(zd, object); } static void ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) { int err; ztest_block_tag_t wbt; dmu_object_info_t doi; enum ztest_io_type io_type; uint64_t blocksize; void *data; VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); blocksize = doi.doi_data_block_size; data = umem_alloc(blocksize, UMEM_NOFAIL); /* * Pick an i/o type at random, biased toward writing block tags. */ io_type = ztest_random(ZTEST_IO_TYPES); if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); switch (io_type) { case ZTEST_IO_WRITE_TAG: ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, offset, 0, 0, 0); (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); break; case ZTEST_IO_WRITE_PATTERN: (void) memset(data, 'a' + (object + offset) % 5, blocksize); if (ztest_random(2) == 0) { /* * Induce fletcher2 collisions to ensure that * zio_ddt_collision() detects and resolves them * when using fletcher2-verify for deduplication. 
*/ ((uint64_t *)data)[0] ^= 1ULL << 63; ((uint64_t *)data)[4] ^= 1ULL << 63; } (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_WRITE_ZEROES: memset(data, 0, blocksize); (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_TRUNCATE: (void) ztest_truncate(zd, object, offset, blocksize); break; case ZTEST_IO_SETATTR: (void) ztest_setattr(zd, object); break; default: break; case ZTEST_IO_REWRITE: (void) pthread_rwlock_rdlock(&ztest_name_lock); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), B_FALSE); VERIFY(err == 0 || err == ENOSPC); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COMPRESSION, ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), B_FALSE); VERIFY(err == 0 || err == ENOSPC); (void) pthread_rwlock_unlock(&ztest_name_lock); VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, DMU_READ_NO_PREFETCH)); (void) ztest_write(zd, object, offset, blocksize, data); break; } (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); umem_free(data, blocksize); } /* * Initialize an object description template. */ static void ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, uint64_t gen) { od->od_dir = ZTEST_DIROBJ; od->od_object = 0; od->od_crtype = type; od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); od->od_crgen = gen; od->od_type = DMU_OT_NONE; od->od_blocksize = 0; od->od_gen = 0; (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%"PRId64")[%"PRIu64"]", tag, id, index); } /* * Lookup or create the objects for a test using the od template. * If the objects do not all exist, or if 'remove' is specified, * remove any existing objects and create new ones. Otherwise, * use the existing objects. */ static int ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) { int count = size / sizeof (*od); int rv = 0; mutex_enter(&zd->zd_dirobj_lock); if ((ztest_lookup(zd, od, count) != 0 || remove) && (ztest_remove(zd, od, count) != 0 || ztest_create(zd, od, count) != 0)) rv = -1; zd->zd_od = od; mutex_exit(&zd->zd_dirobj_lock); return (rv); } void ztest_zil_commit(ztest_ds_t *zd, uint64_t id) { (void) id; zilog_t *zilog = zd->zd_zilog; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* * Remember the committed values in zd, which is in parent/child * shared memory. If we die, the next iteration of ztest_run() * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); ASSERT3P(zd->zd_shared, !=, NULL); ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); } /* * This function is designed to simulate the operations that occur during a * mount/unmount operation. We hold the dataset across these operations in an * attempt to expose any implicit assumptions about ZIL management. */ void ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; /* * We hold the ztest_vdev_lock so we don't cause problems with * other threads that wish to remove a log device, such as * ztest_device_removal(). */ mutex_enter(&ztest_vdev_lock); /* * We grab the zd_dirobj_lock to ensure that no other thread is * updating the zil (i.e. 
adding in-memory log records) and the * zd_zilog_lock to block any I/O. */ mutex_enter(&zd->zd_dirobj_lock); (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); /* zfsvfs_teardown() */ zil_close(zd->zd_zilog); /* zfsvfs_setup() */ VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); zil_replay(os, zd, ztest_replay_vector); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); mutex_exit(&zd->zd_dirobj_lock); mutex_exit(&ztest_vdev_lock); } /* * Verify that we can't destroy an active pool, create an existing pool, * or create a pool with a bad vdev spec. */ void ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa; nvlist_t *nvroot; if (zo->zo_mmp_test) return; /* * Attempt to create using a bad file. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create using a bad mirror. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * We open a reference to the spa and then we try to export it * expecting one of the following errors: * * EBUSY * Because of the reference we just opened. * * ZFS_ERR_EXPORT_IN_PROGRESS * For the case that there is another ztest thread doing * an export concurrently. */ VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); int error = spa_destroy(zo->zo_pool); if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", spa->spa_name, error); } spa_close(spa, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Start and then stop the MMP threads to ensure the startup and shutdown code * works properly. Actual protection and property-related code tested via ZTS. */ void ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa = ztest_spa; if (zo->zo_mmp_test) return; /* * Since enabling MMP involves setting a property, it could not be done * while the pool is suspended. */ if (spa_suspended(spa)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); zfs_multihost_fail_intervals = 0; if (!spa_multihost(spa)) { spa->spa_multihost = B_TRUE; mmp_thread_start(spa); } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); mmp_signal_all_threads(); txg_wait_synced(spa_get_dsl(spa), 0); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); if (spa_multihost(spa)) { mmp_thread_stop(spa); spa->spa_multihost = B_FALSE; } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); } void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; uint64_t version, newversion; nvlist_t *nvroot, *props; char *name; if (ztest_opts.zo_mmp_test) return; /* dRAID added after feature flags, skip upgrade test. 
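 * (dRAID vdevs cannot exist on the pre-feature-flag pool versions this test creates.)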
*/ if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) return; mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); /* * Clean up from previous runs. */ (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. */ switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; break; case 2: initial_version = SPA_VERSION_RAIDZ2; break; case 3: initial_version = SPA_VERSION_RAIDZ3; break; } /* * Create a pool with a spa version that can be upgraded. Pick * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. */ do { version = ztest_random_spa_version(initial_version); } while (version > SPA_VERSION_BEFORE_FEATURES); props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), version); VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(name, &spa, FTAG)); VERIFY3U(spa_version(spa), ==, version); newversion = ztest_random_spa_version(version + 1); if (ztest_opts.zo_verbose >= 4) { (void) printf("upgrading spa version from " "%"PRIu64" to %"PRIu64"\n", version, newversion); } spa_upgrade(spa, newversion); VERIFY3U(spa_version(spa), >, version); VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, zpool_prop_to_name(ZPOOL_PROP_VERSION))); spa_close(spa, FTAG); kmem_strfree(name); mutex_exit(&ztest_vdev_lock); } static void ztest_spa_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: break; case ENOSPC: ztest_record_enospc(FTAG); break; default: fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); } } static void ztest_spa_discard_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint_discard(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_NO_CHECKPOINT: break; default: fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", spa->spa_name, error); } } void ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; mutex_enter(&ztest_checkpoint_lock); if (ztest_random(2) == 0) { ztest_spa_checkpoint(spa); } else { ztest_spa_discard_checkpoint(spa); } mutex_exit(&ztest_checkpoint_lock); } static vdev_t * vdev_lookup_by_path(vdev_t *vd, const char *path) { vdev_t *mvd; int c; if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) return (vd); for (c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != NULL) return (mvd); return (NULL); } static int spa_num_top_vdevs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); return (rvd->vdev_children); } /* * Verify that vdev_add() works as expected. 
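 * When the pool has slogs, one is removed about 1/4 of the time; otherwise a new top-level vdev is added, and roughly 1/4 of those additions are log devices.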
*/ void ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; nvlist_t *nvroot; int error; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. */ if (spa_has_slogs(spa) && ztest_random(4) == 0) { metaslab_group_t *mg; /* * find the first real slog in log allocation class */ mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; while (!mg->mg_vd->vdev_islog) mg = mg->mg_next; guid = mg->mg_vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); /* * We have to grab the zs_name_lock as writer to * prevent a race between removing a slog (dmu_objset_find) * and destroying a dataset. Removing the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_remove(spa, guid, B_FALSE); pthread_rwlock_unlock(&ztest_name_lock); switch (error) { case 0: case EEXIST: /* Generic zil_reset() error */ case EBUSY: /* Replay required */ case EACCES: /* Crypto key not loaded */ case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: fatal(B_FALSE, "spa_vdev_remove() = %d", error); } } else { spa_config_exit(spa, SCL_VDEV, FTAG); /* * Make 1/4 of the devices be log devices */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); switch (error) { case 0: break; case ENOSPC: ztest_record_enospc("spa_vdev_add"); break; default: fatal(B_FALSE, "spa_vdev_add() = %d", error); } } mutex_exit(&ztest_vdev_lock); } void ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; const char *class = (ztest_random(2) == 0) ? 
VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; /* * By default add a special vdev 50% of the time */ if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && ztest_random(2) == 0)) { return; } mutex_enter(&ztest_vdev_lock); /* Only test with mirrors */ if (zs->zs_mirrors < 2) { mutex_exit(&ztest_vdev_lock); return; } /* requires feature@allocation_classes */ if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { mutex_exit(&ztest_vdev_lock); return; } leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); if (error == ENOSPC) ztest_record_enospc("spa_vdev_add"); else if (error != 0) fatal(B_FALSE, "spa_vdev_add() = %d", error); /* * 50% of the time allow small blocks in the special class */ if (error == 0 && spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { if (ztest_opts.zo_verbose >= 3) (void) printf("Enabling special VDEV small blocks\n"); (void) ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); } mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 3) { metaslab_class_t *mc; if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) mc = spa_special_class(spa); else mc = spa_dedup_class(spa); (void) printf("Added a %s mirrored vdev (of %d)\n", class, (int)mc->mc_groups); } } /* * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. */ void ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; spa_aux_vdev_t *sav; const char *aux; char *path; uint64_t guid = 0; int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); if (ztest_random(2) == 0) { sav = &spa->spa_spares; aux = ZPOOL_CONFIG_SPARES; } else { sav = &spa->spa_l2cache; aux = ZPOOL_CONFIG_L2CACHE; } mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); if (sav->sav_count != 0 && ztest_random(4) == 0) { /* * Pick a random device to remove. */ vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) ignore_err = ENOTSUP; guid = svd->vdev_guid; } else { /* * Find an unused device we can add. */ zs->zs_vdev_aux = 0; for (;;) { int c; (void) snprintf(path, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, ztest_opts.zo_pool, aux, zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) break; if (c == sav->sav_count && vdev_lookup_by_path(rvd, path) == NULL) break; zs->zs_vdev_aux++; } } spa_config_exit(spa, SCL_VDEV, FTAG); if (guid == 0) { /* * Add a new device. */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); error = spa_vdev_add(spa, nvroot); switch (error) { case 0: break; default: fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); } fnvlist_free(nvroot); } else { /* * Remove an existing device. 
Sometimes, dirty its * vdev state first to make sure we handle removal * of devices that have pending state changes. */ if (ztest_random(2) == 0) (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); switch (error) { case 0: case EBUSY: case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: if (error != ignore_err) fatal(B_FALSE, "spa_vdev_remove(%"PRIu64") = %d", guid, error); } } mutex_exit(&ztest_vdev_lock); umem_free(path, MAXPATHLEN); } /* * split a pool if it has mirror tlvdevs */ void ztest_split_pool(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; nvlist_t *tree, **child, *config, *split, **schild; uint_t c, children, schildren = 0, lastlogid = 0; int error = 0; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); /* ensure we have a usable config; mirrors of raidz aren't supported */ if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } /* clean up the old pool, if any */ (void) spa_destroy("splitp"); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* generate a config from the existing config */ mutex_enter(&spa->spa_props_lock); tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); mutex_exit(&spa->spa_props_lock); VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children)); schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); for (c = 0; c < children; c++) { vdev_t *tvd = rvd->vdev_child[c]; nvlist_t **mchild; uint_t mchildren; if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { schild[schildren] = fnvlist_alloc(); fnvlist_add_string(schild[schildren], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); fnvlist_add_uint64(schild[schildren], ZPOOL_CONFIG_IS_HOLE, 1); if (lastlogid == 0) lastlogid = schildren; ++schildren; continue; } lastlogid = 0; VERIFY0(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); schild[schildren++] = fnvlist_dup(mchild[0]); } /* OK, create a config that can be used to split */ split = fnvlist_alloc(); fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); config = fnvlist_alloc(); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); for (c = 0; c < schildren; c++) fnvlist_free(schild[c]); free(schild); fnvlist_free(split); spa_config_exit(spa, SCL_VDEV, FTAG); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); (void) pthread_rwlock_unlock(&ztest_name_lock); fnvlist_free(config); if (error == 0) { (void) printf("successful split - results:\n"); mutex_enter(&spa_namespace_lock); show_pool_stats(spa); show_pool_stats(spa_lookup("splitp")); mutex_exit(&spa_namespace_lock); ++zs->zs_splits; --zs->zs_mirrors; } mutex_exit(&ztest_vdev_lock); } /* * Verify that we can attach and detach devices. 
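 * A randomly chosen leaf is detached, attached to, or replaced, and the result is compared against the error the vdev layer is expected to return for that configuration.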
*/ void ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; int newvd_is_dspare = B_FALSE; int oldvd_is_log; int error, expected_error; if (ztest_opts.zo_mmp_test) return; oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * If a vdev is in the process of being removed, its removal may * finish while we are in progress, leading to an unexpected error * value. Don't bother trying to attach while we are in the middle * of removal. */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } /* * Decide whether to do an attach or a replace. */ replacing = ztest_random(2); /* * Pick a random top-level vdev. */ top = ztest_random_vdev_top(spa, B_TRUE); /* * Pick a random leaf within it. */ leaf = ztest_random(leaves); /* * Locate this vdev. */ oldvd = rvd->vdev_child[top]; /* pick a child from the mirror */ if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; } /* pick a child out of the raidz group */ if (ztest_opts.zo_raid_children > 1) { if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; } /* * If we're already doing an attach or replace, oldvd may be a * mirror vdev -- in which case, pick a random child. */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; ASSERT3U(oldvd->vdev_children, >=, 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; (void) strcpy(oldpath, oldvd->vdev_path); pvd = oldvd->vdev_parent; pguid = pvd->vdev_guid; /* * If oldvd has siblings, then half of the time, detach it. Prior * to the detach the pool is scrubbed in order to prevent creating * unrepairable blocks as a result of the data corruption injection. */ if (oldvd_has_siblings && ztest_random(2) == 0) { spa_config_exit(spa, SCL_ALL, FTAG); error = ztest_scrub_impl(spa); if (error) goto out; error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && error != ZFS_ERR_DISCARDING_CHECKPOINT) fatal(B_FALSE, "detach (%s) returned %d", oldpath, error); goto out; } /* * For the new vdev, choose with equal probability between the two * standard paths (ending in either 'a' or 'b') or a random hot spare. 
*/ if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; if (newvd->vdev_ops == &vdev_draid_spare_ops) newvd_is_dspare = B_TRUE; (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); if (ztest_random(2) == 0) newpath[strlen(newpath) - 1] = 'b'; newvd = vdev_lookup_by_path(rvd, newpath); } if (newvd) { /* * Reopen to ensure the vdev's asize field isn't stale. */ vdev_reopen(newvd); newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. * If it's smaller, the attach should fail. * If it's larger, and we're doing a replace, * we should get dynamic LUN growth when we're done. */ newsize = 10 * oldsize / (9 + ztest_random(3)); } /* * If pvd is not a mirror or root, the attach should fail with ENOTSUP, * unless it's a replace; in that case any non-replacing parent is OK. * * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. * * If newvd is a distributed spare and it's being attached to a * dRAID which is not its parent it should fail with EINVAL. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_spare_ops)) expected_error = ENOTSUP; else if (newvd_is_spare && (!replacing || oldvd_is_log)) expected_error = ENOTSUP; else if (newvd == oldvd) expected_error = replacing ? 0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; else if (!newvd_is_dspare && newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) expected_error = ENOTSUP; else expected_error = 0; spa_config_exit(spa, SCL_ALL, FTAG); /* * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, NULL, 0, 0, 1); /* * When supported select either a healing or sequential resilver. */ boolean_t rebuilding = B_FALSE; if (pvd->vdev_ops == &vdev_mirror_ops || pvd->vdev_ops == &vdev_root_ops) { rebuilding = !!ztest_random(2); } error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); fnvlist_free(root); /* * If our parent was the replacing vdev, but the replace completed, * then instead of failing with ENOTSUP we may either succeed, * fail with ENODEV, or fail with EOVERFLOW. */ if (expected_error == ENOTSUP && (error == 0 || error == ENODEV || error == EOVERFLOW)) expected_error = error; /* * If someone grew the LUN, the replacement may be too small. 
*/ if (error == EOVERFLOW || error == EBUSY) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || error == ZFS_ERR_DISCARDING_CHECKPOINT || error == ZFS_ERR_RESILVER_IN_PROGRESS || error == ZFS_ERR_REBUILD_IN_PROGRESS) expected_error = error; if (error != expected_error && expected_error != EBUSY) { fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " "returned %d, expected %d", oldpath, oldsize, newpath, newsize, replacing, error, expected_error); } out: mutex_exit(&ztest_vdev_lock); umem_free(oldpath, MAXPATHLEN); umem_free(newpath, MAXPATHLEN); } void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; vdev_t *vd; uint64_t guid; int error; mutex_enter(&ztest_vdev_lock); if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); return; } /* * Remove a random top-level vdev and wait for removal to finish. */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); guid = vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); error = spa_vdev_remove(spa, guid, B_FALSE); if (error == 0) { ztest_device_removal_active = B_TRUE; mutex_exit(&ztest_vdev_lock); /* * spa->spa_vdev_removal is created in a sync task that * is initiated via dsl_sync_task_nowait(). Since the * task may not run before spa_vdev_remove() returns, we * must wait at least 1 txg to ensure that the removal * struct has been created. */ txg_wait_synced(spa_get_dsl(spa), 0); while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); } else { mutex_exit(&ztest_vdev_lock); return; } /* * The pool needs to be scrubbed after completing device removal. * Failure to do so may result in checksum errors due to the * strategy employed by ztest_fault_inject() when selecting which * offset are redundant and can be damaged. */ error = spa_scan(spa, POOL_SCAN_SCRUB); if (error == 0) { while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); } mutex_enter(&ztest_vdev_lock); ztest_device_removal_active = B_FALSE; mutex_exit(&ztest_vdev_lock); } /* * Callback function which expands the physical size of the vdev. */ static vdev_t * grow_vdev(vdev_t *vd, void *arg) { spa_t *spa __maybe_unused = vd->vdev_spa; size_t *newsize = arg; size_t fsize; int fd; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); if ((fd = open(vd->vdev_path, O_RDWR)) == -1) return (vd); fsize = lseek(fd, 0, SEEK_END); VERIFY0(ftruncate(fd, *newsize)); if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); } (void) close(fd); return (NULL); } /* * Callback function which expands a given vdev by calling vdev_online(). */ static vdev_t * online_vdev(vdev_t *vd, void *arg) { (void) arg; spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint64_t guid = vd->vdev_guid; uint64_t generation = spa->spa_config_generation + 1; vdev_state_t newstate = VDEV_STATE_UNKNOWN; int error; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* Calling vdev_online will initialize the new metaslabs */ spa_config_exit(spa, SCL_STATE, spa); error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If vdev_online returned an error or the underlying vdev_open * failed then we abort the expand. 
The only way to know that * vdev_open fails is by checking the returned newstate. */ if (error || newstate != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Unable to expand vdev, state %u, " "error %d\n", newstate, error); } return (vd); } ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); /* * Since we dropped the lock we need to ensure that we're * still talking to the original vdev. It's possible this * vdev may have been detached/replaced while we were * trying to online it. */ if (generation != spa->spa_config_generation) { if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " "guid %"PRIu64", state %"PRIu64", " "expected gen %"PRIu64", got gen %"PRIu64"\n", guid, tvd->vdev_state, generation, spa->spa_config_generation); } return (vd); } return (NULL); } /* * Traverse the vdev tree calling the supplied function. * We continue to walk the tree until we either have walked all * children or we receive a non-NULL return from the callback. * If a NULL callback is passed, then we just return back the first * leaf vdev we encounter. */ static vdev_t * vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) { uint_t c; if (vd->vdev_ops->vdev_op_leaf) { if (func == NULL) return (vd); else return (func(vd, arg)); } for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) return (cvd); } return (NULL); } /* * Verify that dynamic LUN growth works as expected. */ void ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; vdev_t *vd, *tvd; metaslab_class_t *mc; metaslab_group_t *mg; size_t psize, newsize; uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; mutex_enter(&ztest_checkpoint_lock); mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If there is a vdev removal in progress, it could complete while * we are running, in which case we would not be able to verify * that the metaslab_class space increased (because it decreases * when the device removal completes). */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; mg = tvd->vdev_mg; mc = mg->mg_class; old_ms_count = tvd->vdev_ms_count; old_class_space = metaslab_class_get_space(mc); /* * Determine the size of the first leaf vdev associated with * our top-level device. */ vd = vdev_walk_tree(tvd, NULL, NULL); ASSERT3P(vd, !=, NULL); ASSERT(vd->vdev_ops->vdev_op_leaf); psize = vd->vdev_psize; /* * We only try to expand the vdev if it's healthy, less than 4x its * original size, and it has a valid psize. */ if (tvd->vdev_state != VDEV_STATE_HEALTHY || psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } ASSERT3U(psize, >, 0); newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); ASSERT3U(newsize, >, psize); if (ztest_opts.zo_verbose >= 6) { (void) printf("Expanding LUN %s from %lu to %lu\n", vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); } /* * Growing the vdev is a two step process: * 1). expand the physical size (i.e. relabel) * 2). 
online the vdev to create the new metaslabs */ if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || vdev_walk_tree(tvd, online_vdev, NULL) != NULL || tvd->vdev_state != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not expand LUN because " "the vdev configuration changed.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } spa_config_exit(spa, SCL_STATE, spa); /* * Expanding the LUN will update the config asynchronously, * thus we must wait for the async thread to complete any * pending tasks before proceeding. */ for (;;) { boolean_t done; mutex_enter(&spa->spa_async_lock); done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); mutex_exit(&spa->spa_async_lock); if (done) break; txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); } spa_config_enter(spa, SCL_STATE, spa, RW_READER); tvd = spa->spa_root_vdev->vdev_child[top]; new_ms_count = tvd->vdev_ms_count; new_class_space = metaslab_class_get_space(mc); if (tvd->vdev_mg != mg || mg->mg_class != mc) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not verify LUN expansion due to " "intervening vdev offline or remove.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } /* * Make sure we were able to grow the vdev. */ if (new_ms_count <= old_ms_count) { fatal(B_FALSE, "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", old_ms_count, new_ms_count); } /* * Make sure we were able to grow the pool. */ if (new_class_space <= old_class_space) { fatal(B_FALSE, "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", old_class_space, new_class_space); } if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); (void) printf("%s grew from %s to %s\n", spa->spa_name, oldnumbuf, newnumbuf); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); } /* * Verify that dmu_objset_{create,destroy,open,close} work as expected. */ static void ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { (void) arg, (void) cr; /* * Create the objects common to all ztest datasets. */ VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); } static int ztest_dataset_create(char *dsname) { int err; uint64_t rand; dsl_crypto_params_t *dcp = NULL; /* * 50% of the time, we create encrypted datasets * using a random cipher suite and a hard-coded * wrapping key. */ rand = ztest_random(2); if (rand != 0) { nvlist_t *crypto_args = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); /* slight bias towards the default cipher suite */ rand = ztest_random(ZIO_CRYPT_FUNCTIONS); if (rand < ZIO_CRYPT_AES_128_CCM) rand = ZIO_CRYPT_ON; fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); /* * These parameters aren't really used by the kernel. They * are simply stored so that userspace knows how to load * the wrapping key. 
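 * Because the raw wrapping key is passed directly in wkeydata, the salt and
 * iteration count below are effectively placeholders; they would only matter
 * for a passphrase-derived key, which this test does not use.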
*/ fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); fnvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, crypto_args, &dcp)); /* * Cycle through all available encryption implementations * to verify interoperability. */ VERIFY0(gcm_impl_set("cycle")); VERIFY0(aes_impl_set("cycle")); fnvlist_free(crypto_args); fnvlist_free(props); } err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, ztest_objset_create_cb, NULL); dsl_crypto_params_free(dcp, !!err); rand = ztest_random(100); if (err || rand < 80) return (err); if (ztest_opts.zo_verbose >= 5) (void) printf("Setting dataset %s to sync always\n", dsname); return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, ZFS_SYNC_ALWAYS, B_FALSE)); } static int ztest_objset_destroy_cb(const char *name, void *arg) { (void) arg; objset_t *os; dmu_object_info_t doi; int error; /* * Verify that the dataset contains a directory object. */ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os)); error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT0(error); ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } dmu_objset_disown(os, B_TRUE, FTAG); /* * Destroy the dataset. */ if (strchr(name, '@') != NULL) { error = dsl_destroy_snapshot(name, B_TRUE); if (error != ECHRNG) { /* * The program was executed, but encountered a runtime * error, such as insufficient slop, or a hold on the * dataset. */ ASSERT0(error); } } else { error = dsl_destroy_head(name); if (error == ENOSPC) { /* There could be checkpoint or insufficient slop */ ztest_record_enospc(FTAG); } else if (error != EBUSY) { /* There could be a hold on this dataset */ ASSERT0(error); } } return (0); } static boolean_t ztest_snapshot_create(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); error = dmu_objset_snapshot_one(osname, snapname); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (B_FALSE); } if (error != 0 && error != EEXIST) { fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, snapname, error); } return (B_TRUE); } static boolean_t ztest_snapshot_destroy(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", osname, id); error = dsl_destroy_snapshot(snapname, B_FALSE); if (error != 0 && error != ENOENT) fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", snapname, error); return (B_TRUE); } void ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) zd; ztest_ds_t *zdtmp; int iters; int error; objset_t *os, *os2; char name[ZFS_MAX_DATASET_NAME_LEN]; zilog_t *zilog; int i; zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", ztest_opts.zo_pool, id); /* * If this dataset exists from a previous run, process its replay log * half of the time. If we don't replay it, then dsl_destroy_head() * (invoked from ztest_objset_destroy_cb()) should just throw it away. 
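 * Either outcome must leave the dataset usable: an unreplayed ZIL is simply
 * discarded when the dataset is destroyed, so skipping replay is harmless.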
*/ if (ztest_random(2) == 0 && ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os) == 0) { ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); dmu_objset_disown(os, B_TRUE, FTAG); } /* * There may be an old instance of the dataset we're about to * create lying around from a previous run. If so, destroy it * and all of its snapshots. */ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* * Verify that the destroyed dataset is no longer in the namespace. */ VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os)); /* * Verify that we can create a new dataset. */ error = ztest_dataset_create(name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); } VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os)); ztest_zd_init(zdtmp, NULL, os); /* * Open the intent log for it. */ zilog = zil_open(os, ztest_get_data, NULL); /* * Put some objects in there, do a little I/O to them, * and randomly take a couple of snapshots along the way. */ iters = ztest_random(5); for (i = 0; i < iters; i++) { ztest_dmu_object_alloc_free(zdtmp, id); if (ztest_random(iters) == 0) (void) ztest_snapshot_create(name, i); } /* * Verify that we cannot create an existing dataset. */ VERIFY3U(EEXIST, ==, dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); /* * Verify that we can hold an objset that is also owned. */ VERIFY0(dmu_objset_hold(name, FTAG, &os2)); dmu_objset_rele(os2, FTAG); /* * Verify that we cannot own an objset that is already owned. */ VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os2)); zil_close(zilog); dmu_objset_disown(os, B_TRUE, FTAG); ztest_zd_fini(zdtmp); out: (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(zdtmp, sizeof (ztest_ds_t)); } /* * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. */ void ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_snapshot_destroy(zd->zd_name, id); (void) ztest_snapshot_create(zd->zd_name, id); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Cleanup non-standard snapshots and clones. 
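 * These are the s1/c1/s2/c2/s3 datasets created by
 * ztest_dsl_dataset_promote_busy(); they are destroyed in reverse dependency
 * order (clone2, snap3, snap2, clone1, snap1) so each destroy can succeed.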
*/ static void ztest_dsl_dataset_cleanup(char *osname, uint64_t id) { char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", osname, id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", osname, id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", clone1name, id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", osname, id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", clone1name, id); error = dsl_destroy_head(clone2name); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); error = dsl_destroy_snapshot(snap3name, B_FALSE); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", snap3name, error); error = dsl_destroy_snapshot(snap2name, B_FALSE); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", snap2name, error); error = dsl_destroy_head(clone1name); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); error = dsl_destroy_snapshot(snap1name, B_FALSE); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", snap1name, error); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } /* * Verify dsl_dataset_promote handles EBUSY */ void ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { objset_t *os; char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; char *osname = zd->zd_name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); ztest_dsl_dataset_cleanup(osname, id); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", osname, id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", osname, id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", clone1name, id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", osname, id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", clone1name, id); error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); } error = dmu_objset_clone(clone1name, snap1name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } 
fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); } error = dmu_objset_clone(clone2name, snap3name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); } error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os); if (error) fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); if (error == ENOSPC) { dmu_objset_disown(os, B_TRUE, FTAG); ztest_record_enospc(FTAG); goto out; } if (error != EBUSY) fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, error); dmu_objset_disown(os, B_TRUE, FTAG); out: ztest_dsl_dataset_cleanup(osname, id); (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 4 /* * Verify that dmu_object_{alloc,free} work as expected. */ void ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; int batchsize; int size; int b; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); batchsize = OD_ARRAY_SIZE; for (b = 0; b < batchsize; b++) ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0); /* * Destroy the previous batch of objects, create a new batch, * and do some I/O on the new objects. */ if (ztest_object_init(zd, od, size, B_TRUE) != 0) return; while (ztest_random(4 * batchsize) != 0) ztest_io(zd, od[ztest_random(batchsize)].od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); umem_free(od, size); } /* * Rewind the global allocator to verify object allocation backfilling. */ void ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; uint64_t object; /* * Rewind the global allocator randomly back to a lower object number * to force backfilling and reclamation of recently freed dnodes. */ mutex_enter(&os->os_obj_lock); object = ztest_random(os->os_obj_next_chunk); os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); mutex_exit(&os->os_obj_lock); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 /* * Verify that dmu_{read,write} work as expected. */ void ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) { int size; ztest_od_t *od; objset_t *os = zd->zd_os; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); dmu_tx_t *tx; int freeit, error; uint64_t i, n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 40; int free_percent = 5; /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. 
Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is arbitrary. It doesn't have to be a power of two, * and it doesn't have any relation to the object blocksize. * The only requirement is that it can hold at least two bufwads. * * Normally, we write the bufwad to each of these locations. * However, free_percent of the time we instead write zeroes to * packobj and perform a dmu_free_range() on bigobj. By comparing * bigobj to packobj, we can verify that the DMU is correctly * tracking which parts of an object are allocated and free, * and that the contents of the allocated blocks are correct. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; chunksize = od[0].od_gen; ASSERT3U(chunksize, ==, od[1].od_gen); /* * Prefetch a random chunk of the big object. * Our aim here is to get some async reads in flight * for blocks that we may free below; the DMU should * handle this race correctly. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, ZIO_PRIORITY_SYNC_READ); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_alloc(packsize, UMEM_NOFAIL); bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); /* * free_percent of the time, free a range of bigobj rather than * overwriting it. */ freeit = (ztest_random(100) < free_percent); /* * Read the current contents of our objects. */ error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); if (freeit) dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); else dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); /* This accounts for setting the checksum/compression. */ dmu_tx_hold_bonus(tx, bigobj); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); return; } enum zio_checksum cksum; do { cksum = (enum zio_checksum) ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); dmu_object_set_checksum(os, bigobj, cksum, tx); enum zio_compress comp; do { comp = (enum zio_compress) ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); dmu_object_set_compress(os, bigobj, comp, tx); /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. 
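 * As a concrete illustration (assuming a hypothetical chunksize of 8192
 * bytes and a 24-byte bufwad_t): for i = 3, 'pack' is packbuf[3], 'bigH' is
 * the bufwad starting at byte 3 * 8192 of bigbuf, and 'bigT' is the bufwad
 * occupying the final 24 bytes of that same chunk.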
*/ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(B_FALSE, "future leak: got %"PRIx64", open txg is %"PRIx64"", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(B_FALSE, "wrong index: " "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", pack->bw_index, n, i); if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigH mismatch in %p/%p", pack, bigH); if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigT mismatch in %p/%p", pack, bigT); if (freeit) { memset(pack, 0, sizeof (bufwad_t)); } else { pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); } *bigH = *pack; *bigT = *pack; } /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { if (ztest_opts.zo_verbose >= 7) { (void) printf("freeing offset %"PRIx64" size %"PRIx64"" " txg %"PRIx64"\n", bigoff, bigsize, txg); } VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %"PRIx64" size %"PRIx64"" " txg %"PRIx64"\n", bigoff, bigsize, txg); } dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. */ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); } static void compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { uint64_t i; bufwad_t *pack; bufwad_t *bigH; bufwad_t *bigT; /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. 
*/ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(B_FALSE, "future leak: got %"PRIx64", open txg is %"PRIx64"", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(B_FALSE, "wrong index: " "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", pack->bw_index, n, i); if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigH mismatch in %p/%p", pack, bigH); if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigT mismatch in %p/%p", pack, bigT); pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); *bigH = *pack; *bigT = *pack; } } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 void ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; uint64_t i; int error; int size; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t blocksize = ztest_random_blocksize(); uint64_t chunksize = blocksize; uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 9; dmu_buf_t *bonus_db; arc_buf_t **bigbuf_arcbufs; dmu_object_info_t doi; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is set equal to bigobj block size so that * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; blocksize = od[0].od_blocksize; chunksize = blocksize; ASSERT3U(chunksize, ==, od[1].od_gen); VERIFY0(dmu_object_info(os, bigobj, &doi)); VERIFY(ISP2(doi.doi_data_block_size)); VERIFY3U(chunksize, ==, doi.doi_data_block_size); VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_zalloc(packsize, UMEM_NOFAIL); bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); /* * Iteration 0 test zcopy for DB_UNCACHED dbufs. * Iteration 1 test zcopy to already referenced dbufs. * Iteration 2 test zcopy to dirty dbuf in the same txg. 
* Iteration 3 test zcopy to dbuf dirty in previous txg. * Iteration 4 test zcopy when dbuf is no longer dirty. * Iteration 5 test zcopy when it can't be done. * Iteration 6 one more zcopy write. */ for (i = 0; i < 7; i++) { uint64_t j; uint64_t off; /* * In iteration 5 (i == 5) use arcbufs * that don't match bigobj blksz to test * dmu_assign_arcbuf_by_dbuf() when it can't directly * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { bigbuf_arcbufs[2 * j] = dmu_request_arcbuf(bonus_db, chunksize / 2); bigbuf_arcbufs[2 * j + 1] = dmu_request_arcbuf(bonus_db, chunksize / 2); } } /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( bigbuf_arcbufs[2 * j]); dmu_return_arcbuf( bigbuf_arcbufs[2 * j + 1]); } } umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); dmu_buf_rele(bonus_db, FTAG); return; } /* * 50% of the time don't read objects in the 1st iteration to * test dmu_assign_arcbuf_by_dbuf() for the case when there are * no existing dbufs for the specified offsets. */ if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, n, chunksize, txg); /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %"PRIx64" size %"PRIx64"" " txg %"PRIx64"\n", bigoff, bigsize, txg); } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { memcpy(bigbuf_arcbufs[j]->b_data, (caddr_t)bigbuf + (off - bigoff), chunksize); } else { memcpy(bigbuf_arcbufs[2 * j]->b_data, (caddr_t)bigbuf + (off - bigoff), chunksize / 2); memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, (caddr_t)bigbuf + (off - bigoff) + chunksize / 2, chunksize / 2); } if (i == 1) { VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[j], tx)); } else { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[2 * j], tx)); VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off + chunksize / 2, bigbuf_arcbufs[2 * j + 1], tx)); } if (i == 1) { dmu_buf_rele(dbt, FTAG); } } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. 
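 * Both objects are read back through the DMU and compared byte-for-byte with
 * the buffers we just wrote; any mismatch indicates a lost or misdirected
 * write.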
*/ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } if (i == 2) { txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); } else if (i == 3) { txg_wait_synced(dmu_objset_pool(os), 0); } } dmu_buf_rele(bonus_db, FTAG); umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); } void ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) { (void) id; ztest_od_t *od; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); uint64_t offset = (1ULL << (ztest_random(20) + 43)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); /* * Have multiple threads write to large offsets in an object * to verify that parallel writes to an object -- even to the * same blocks within the object -- doesn't cause any trouble. */ ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) return; while (ztest_random(10) != 0) ztest_io(zd, od->od_object, offset); umem_free(od, sizeof (ztest_od_t)); } void ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); uint64_t count = ztest_random(20) + 1; uint64_t blocksize = ztest_random_blocksize(); void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } ztest_prealloc(zd, od->od_object, offset, count * blocksize); data = umem_zalloc(blocksize, UMEM_NOFAIL); while (ztest_random(count) != 0) { uint64_t randoff = offset + (ztest_random(count) * blocksize); if (ztest_write(zd, od->od_object, randoff, blocksize, data) != 0) break; while (ztest_random(4) != 0) ztest_io(zd, od->od_object, randoff); } umem_free(data, blocksize); umem_free(od, sizeof (ztest_od_t)); } /* * Verify that zap_{create,destroy,add,remove,update} work as expected. */ #define ZTEST_ZAP_MIN_INTS 1 #define ZTEST_ZAP_MAX_INTS 4 #define ZTEST_ZAP_MAX_PROPS 1000 void ztest_zap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object; uint64_t txg, last_txg; uint64_t value[ZTEST_ZAP_MAX_INTS]; uint64_t zl_ints, zl_intsize, prop; int i, ints; dmu_tx_t *tx; char propname[100], txgname[100]; int error; const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Generate a known hash collision, and verify that * we can lookup and remove both entries. 
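 * The two names in hc[] are believed to hash to the same value under the ZAP
 * hash function, so this exercises the collision-handling path rather than
 * the common single-entry case.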
*/ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; for (i = 0; i < 2; i++) { value[i] = i; VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); } for (i = 0; i < 2; i++) { VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); VERIFY0( zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); } for (i = 0; i < 2; i++) { VERIFY0(zap_remove(os, object, hc[i], tx)); } dmu_tx_commit(tx); /* * Generate a bunch of random entries. */ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%"PRIu64"", prop); (void) sprintf(txgname, "txg_%"PRIu64"", prop); memset(value, 0, sizeof (value)); last_txg = 0; /* * If these zap entries already exist, validate their contents. */ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == 0) { ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); VERIFY0(zap_lookup(os, object, txgname, zl_intsize, zl_ints, &last_txg)); VERIFY0(zap_length(os, object, propname, &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, ints); VERIFY0(zap_lookup(os, object, propname, zl_intsize, zl_ints, value)); for (i = 0; i < ints; i++) { ASSERT3U(value[i], ==, last_txg + object + i); } } else { ASSERT3U(error, ==, ENOENT); } /* * Atomically update two entries in our zap object. * The first is named txg_%llu, and contains the txg * in which the property was last updated. The second * is named prop_%llu, and the nth element of its value * should be txg + object + n. */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; if (last_txg > txg) fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", last_txg, txg); for (i = 0; i < ints; i++) value[i] = txg + object + i; VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx)); VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), ints, value, tx)); dmu_tx_commit(tx); /* * Remove a random pair of entries. */ prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%"PRIu64"", prop); (void) sprintf(txgname, "txg_%"PRIu64"", prop); error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == ENOENT) goto out; ASSERT0(error); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; VERIFY0(zap_remove(os, object, txgname, tx)); VERIFY0(zap_remove(os, object, propname, tx)); dmu_tx_commit(tx); out: umem_free(od, sizeof (ztest_od_t)); } /* * Test case to test the upgrading of a microzap to fatzap. */ void ztest_fzap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object, txg, value; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Add entries to this ZAP and make sure it spills over * and gets upgraded to a fatzap. Also, since we are adding * 2050 entries we should see ptrtbl growth and leaf-block split. 
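 * A microzap is confined to a single block of fixed-size (64-byte) entries,
 * which even at the largest block size allows only about 2047 of them, so
 * 2050 insertions are enough to force the fatzap conversion.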
*/ for (value = 0; value < 2050; value++) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_tx_t *tx; int error; (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", id, value); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, name); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; error = zap_add(os, object, name, sizeof (uint64_t), 1, &value, tx); ASSERT(error == 0 || error == EEXIST); dmu_tx_commit(tx); } out: umem_free(od, sizeof (ztest_od_t)); } void ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; dmu_tx_t *tx; int i, namelen, error; int micro = ztest_random(2); char name[20], string_value[20]; void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } object = od->od_object; /* * Generate a random name of the form 'xxx.....' where each * x is a random printable character and the dots are dots. * There are 94 such characters, and the name length goes from * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. */ namelen = ztest_random(sizeof (name) - 5) + 5 + 1; for (i = 0; i < 3; i++) name[i] = '!' + ztest_random('~' - '!' + 1); for (; i < namelen - 1; i++) name[i] = '.'; name[i] = '\0'; if ((namelen & 1) || micro) { wsize = sizeof (txg); wc = 1; data = &txg; } else { wsize = 1; wc = namelen; data = string_value; } count = -1ULL; VERIFY0(zap_count(os, object, &count)); ASSERT3S(count, !=, -1ULL); /* * Select an operation: length, lookup, add, update, remove. */ i = ztest_random(5); if (i >= 2) { tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(od, sizeof (ztest_od_t)); return; } memcpy(string_value, name, namelen); } else { tx = NULL; txg = 0; memset(string_value, 0, namelen); } switch (i) { case 0: error = zap_length(os, object, name, &zl_wsize, &zl_wc); if (error == 0) { ASSERT3U(wsize, ==, zl_wsize); ASSERT3U(wc, ==, zl_wc); } else { ASSERT3U(error, ==, ENOENT); } break; case 1: error = zap_lookup(os, object, name, wsize, wc, data); if (error == 0) { if (data == string_value && memcmp(name, data, namelen) != 0) fatal(B_FALSE, "name '%s' != val '%s' len %d", name, (char *)data, namelen); } else { ASSERT3U(error, ==, ENOENT); } break; case 2: error = zap_add(os, object, name, wsize, wc, data, tx); ASSERT(error == 0 || error == EEXIST); break; case 3: VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); break; case 4: error = zap_remove(os, object, name, tx); ASSERT(error == 0 || error == ENOENT); break; } if (tx != NULL) dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Commit callback data. 
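 * One of these is allocated per registered callback; it records the txg the
 * callback was registered in so that ztest_commit_callback() can verify the
 * callback never fires before that txg has been synced.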
*/ typedef struct ztest_cb_data { list_node_t zcd_node; uint64_t zcd_txg; int zcd_expected_err; boolean_t zcd_added; boolean_t zcd_called; spa_t *zcd_spa; } ztest_cb_data_t; /* This is the actual commit callback function */ static void ztest_commit_callback(void *arg, int error) { ztest_cb_data_t *data = arg; uint64_t synced_txg; VERIFY3P(data, !=, NULL); VERIFY3S(data->zcd_expected_err, ==, error); VERIFY(!data->zcd_called); synced_txg = spa_last_synced_txg(data->zcd_spa); if (data->zcd_txg > synced_txg) fatal(B_FALSE, "commit callback of txg %"PRIu64" called prematurely, " "last synced txg = %"PRIu64"\n", data->zcd_txg, synced_txg); data->zcd_called = B_TRUE; if (error == ECANCELED) { ASSERT0(data->zcd_txg); ASSERT(!data->zcd_added); /* * The private callback data should be destroyed here, but * since we are going to check the zcd_called field after * dmu_tx_abort(), we will destroy it there. */ return; } ASSERT(data->zcd_added); ASSERT3U(data->zcd_txg, !=, 0); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* See if this cb was called more quickly */ if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) zc_min_txg_delay = synced_txg - data->zcd_txg; /* Remove our callback from the list */ list_remove(&zcl.zcl_callbacks, data); (void) mutex_exit(&zcl.zcl_callbacks_lock); umem_free(data, sizeof (ztest_cb_data_t)); } /* Allocate and initialize callback data structure */ static ztest_cb_data_t * ztest_create_cb_data(objset_t *os, uint64_t txg) { ztest_cb_data_t *cb_data; cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); cb_data->zcd_txg = txg; cb_data->zcd_spa = dmu_objset_spa(os); list_link_init(&cb_data->zcd_node); return (cb_data); } /* * Commit callback test. */ void ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; ztest_cb_data_t *cb_data[3], *tmp_cb; uint64_t old_txg, txg; int i, error = 0; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } tx = dmu_tx_create(os); cb_data[0] = ztest_create_cb_data(os, 0); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); /* Every once in a while, abort the transaction on purpose */ if (ztest_random(100) == 0) error = -1; if (!error) error = dmu_tx_assign(tx, TXG_NOWAIT); txg = error ? 0 : dmu_tx_get_txg(tx); cb_data[0]->zcd_txg = txg; cb_data[1] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); if (error) { /* * It's not a strict requirement to call the registered * callbacks from inside dmu_tx_abort(), but that's what * it's supposed to happen in the current implementation * so we will check for that. */ for (i = 0; i < 2; i++) { cb_data[i]->zcd_expected_err = ECANCELED; VERIFY(!cb_data[i]->zcd_called); } dmu_tx_abort(tx); for (i = 0; i < 2; i++) { VERIFY(cb_data[i]->zcd_called); umem_free(cb_data[i], sizeof (ztest_cb_data_t)); } umem_free(od, sizeof (ztest_od_t)); return; } cb_data[2] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* * Read existing data to make sure there isn't a future leak. 
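 * A "future leak" would mean the object already holds a txg value larger than
 * the txg we are writing in, i.e. a write from a later txg made it to stable
 * storage ahead of time.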
*/ VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), &old_txg, DMU_READ_PREFETCH)); if (old_txg > txg) fatal(B_FALSE, "future leak: got %"PRIu64", open txg is %"PRIu64"", old_txg, txg); dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* * Since commit callbacks don't have any ordering requirement and since * it is theoretically possible for a commit callback to be called * after an arbitrary amount of time has elapsed since its txg has been * synced, it is difficult to reliably determine whether a commit * callback hasn't been called due to high load or due to a flawed * implementation. * * In practice, we will assume that if after a certain number of txgs a * commit callback hasn't been called, then most likely there's an * implementation bug. */ tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { fatal(B_FALSE, "Commit callback threshold exceeded, " "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", tmp_cb->zcd_txg, txg); } /* * Let's find the place to insert our callbacks. * * Even though the list is ordered by txg, it is possible for the * insertion point to not be the end because our txg may already be * quiescing at this point and other callbacks in the open txg * (from other objsets) may have sneaked in. */ tmp_cb = list_tail(&zcl.zcl_callbacks); while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); /* Add the 3 callbacks to the list */ for (i = 0; i < 3; i++) { if (tmp_cb == NULL) list_insert_head(&zcl.zcl_callbacks, cb_data[i]); else list_insert_after(&zcl.zcl_callbacks, tmp_cb, cb_data[i]); cb_data[i]->zcd_added = B_TRUE; VERIFY(!cb_data[i]->zcd_called); tmp_cb = cb_data[i]; } zc_cb_counter += 3; (void) mutex_exit(&zcl.zcl_callbacks_lock); dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Visit each object in the dataset. Verify that its properties * are consistent with what was stored in the block tag when it was created, * and that its unused bonus buffer space has not been overwritten.
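 * Objects whose bonus buffer is too small to hold a block tag, or whose tag
 * does not carry BT_MAGIC, are skipped rather than treated as errors.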
*/ void ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; uint64_t obj; int err = 0; for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { ztest_block_tag_t *bt = NULL; dmu_object_info_t doi; dmu_buf_t *db; ztest_object_lock(zd, obj, RL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_size >= sizeof (*bt)) bt = ztest_bt_bonus(db); if (bt && bt->bt_magic == BT_MAGIC) { ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, bt->bt_offset, bt->bt_gen, bt->bt_txg, bt->bt_crtxg); ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); } dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, obj); } } void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) { (void) id; zfs_prop_t proplist[] = { ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_COPIES, ZFS_PROP_DEDUP }; (void) pthread_rwlock_rdlock(&ztest_name_lock); for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, ztest_random_blocksize(), (int)ztest_random(2))); (void) pthread_rwlock_unlock(&ztest_name_lock); } void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; nvlist_t *props = NULL; (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); VERIFY0(spa_prop_get(ztest_spa, &props)); if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); fnvlist_free(props); (void) pthread_rwlock_unlock(&ztest_name_lock); } static int user_release_one(const char *snapname, const char *holdname) { nvlist_t *snaps, *holds; int error; snaps = fnvlist_alloc(); holds = fnvlist_alloc(); fnvlist_add_boolean(holds, holdname); fnvlist_add_nvlist(snaps, snapname, holds); fnvlist_free(holds); error = dsl_dataset_user_release(snaps, NULL); fnvlist_free(snaps); return (error); } /* * Test snapshot hold/release and deferred destroy. */ void ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { int error; objset_t *os = zd->zd_os; objset_t *origin; char snapname[100]; char fullname[100]; char clonename[100]; char tag[100]; char osname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *holds; (void) pthread_rwlock_rdlock(&ztest_name_lock); dmu_objset_name(os, osname); (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", osname, id); (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); /* * Clean up from any previous run. */ error = dsl_destroy_head(clonename); if (error != ENOENT) ASSERT0(error); error = user_release_one(fullname, tag); if (error != ESRCH && error != ENOENT) ASSERT0(error); error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != ENOENT) ASSERT0(error); /* * Create snapshot, clone it, mark snap for deferred destroy, * destroy clone, verify snap was also destroyed. 
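 * A snapshot marked for deferred destroy persists only while something still
 * references it, so destroying the clone is what should finally remove it.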
*/ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); } error = dmu_objset_clone(clonename, fullname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_clone"); goto out; } fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = dsl_destroy_head(clonename); if (error) fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); error = dmu_objset_hold(fullname, FTAG, &origin); if (error != ENOENT) fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); /* * Create snapshot, add temporary hold, verify that we can't * destroy a held snapshot, mark for deferred destroy, * release hold, verify snapshot was destroyed. */ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); } holds = fnvlist_alloc(); fnvlist_add_string(holds, fullname, tag); error = dsl_dataset_user_hold(holds, 0, NULL); fnvlist_free(holds); if (error == ENOSPC) { ztest_record_enospc("dsl_dataset_user_hold"); goto out; } else if (error) { fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", fullname, tag, error); } error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", fullname, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = user_release_one(fullname, tag); if (error) fatal(B_FALSE, "user_release_one(%s, %s) = %d", fullname, tag, error); VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); out: (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Inject random faults into the on-disk data. */ void ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; int fd; uint64_t offset; uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; char *path0; char *pathrand; size_t fsize; int bshift = SPA_MAXBLOCKSHIFT + 2; int iters = 1000; int maxfaults; int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); /* * Device removal is in progress, fault injection must be disabled * until it completes and the pool is scrubbed. The fault injection * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); goto out; } maxfaults = MAXFAULTS(zs); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); ASSERT3U(leaves, >=, 1); /* * While ztest is running the number of leaves will not change. This * is critical for the fault injection logic as it determines where * errors can be safely injected such that they are always repairable. * * When restarting ztest a different number of leaves may be requested * which will shift the regions to be damaged. This is fine as long * as the pool has been scrubbed prior to using the new mapping. 
* Failure to do can result in non-repairable damage being injected. */ if (ztest_pool_scrubbed == B_FALSE) goto out; /* * Grab the name lock as reader. There are some operations * which don't like to have their vdevs changed while * they are in progress (i.e. spa_change_guid). Those * operations will have grabbed the name lock as writer. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); /* * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (ztest_random(2) == 0) { /* * Inject errors on a normal data device or slog device. */ top = ztest_random_vdev_top(spa, B_TRUE); leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, * and to the random leaf we selected. We'll induce transient * write failures and random online/offline activity on leaf 0, * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; /* * If the top-level vdev needs to be resilvered * then we only allow faults on the device that is * resilvering. */ if (vd0 != NULL && maxfaults != 1 && (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || vd0->vdev_resilver_txg != 0)) { /* * Make vd0 explicitly claim to be unreadable, * or unwritable, or reach behind its back * and close the underlying fd. We can do this if * maxfaults == 0 because we'll fail and reexecute, * and we can do it if maxfaults >= 2 because we'll * have enough redundancy. If maxfaults == 1, the * combination of this with injection of random data * corruption below exceeds the pool's fault tolerance. */ vdev_file_t *vf = vd0->vdev_tsd; zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", (long long)vd0->vdev_id, (int)maxfaults); if (vf != NULL && ztest_random(3) == 0) { (void) close(vf->vf_file->f_fd); vf->vf_file->f_fd = -1; } else if (ztest_random(2) == 0) { vd0->vdev_cant_read = B_TRUE; } else { vd0->vdev_cant_write = B_TRUE; } guid0 = vd0->vdev_guid; } } else { /* * Inject errors on an l2cache device. */ spa_aux_vdev_t *sav = &spa->spa_l2cache; if (sav->sav_count == 0) { spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); goto out; } vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; guid0 = vd0->vdev_guid; (void) strcpy(path0, vd0->vdev_path); (void) strcpy(pathrand, vd0->vdev_path); leaf = 0; leaves = 1; maxfaults = INT_MAX; /* no limit on cache devices */ } spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); /* * If we can tolerate two or more faults, or we're dealing * with a slog, randomly online/offline vd0. */ if ((maxfaults >= 2 || islog) && guid0 != 0) { if (ztest_random(10) < 6) { int flags = (ztest_random(2) == 0 ? ZFS_OFFLINE_TEMPORARY : 0); /* * We have to grab the zs_name_lock as writer to * prevent a race between offlining a slog and * destroying a dataset. Offlining the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. 
*/ if (islog) (void) pthread_rwlock_wrlock(&ztest_name_lock); VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); if (islog) (void) pthread_rwlock_unlock(&ztest_name_lock); } else { /* * Ideally we would like to be able to randomly * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from * doing so. We grab the ztest_vdev_lock here to * prevent a race between injection testing and * aux_vdev removal. */ mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); mutex_exit(&ztest_vdev_lock); } } if (maxfaults == 0) goto out; /* * We have at least single-fault tolerance, so inject data corruption. */ fd = open(pathrand, O_RDWR); if (fd == -1) /* we hit a gap in the device namespace */ goto out; fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { /* * The offset must be chosen carefully to ensure that * we do not inject a given logical block with errors * on two different leaf devices, because ZFS can not * tolerate that (if maxfaults==1). * * To achieve this we divide each leaf device into * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). * Each chunk is further divided into error-injection * ranges (can accept errors) and clear ranges (we do * not inject errors in those). Each error-injection * range can accept errors only for a single leaf vdev. * Error-injection ranges are separated by clear ranges. * * For example, with 3 leaves, each chunk looks like: * 0 to 32M: injection range for leaf 0 * 32M to 64M: clear range - no injection allowed * 64M to 96M: injection range for leaf 1 * 96M to 128M: clear range - no injection allowed * 128M to 160M: injection range for leaf 2 * 160M to 192M: clear range - no injection allowed * * Each clear range must be large enough such that a * single block cannot straddle it. This way a block * can't be a target in two different injection ranges * (on different leaf vdevs). */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); /* * Only allow damage to the labels at one end of the vdev. * * If all labels are damaged, the device will be totally * inaccessible, which will result in loss of data, * because we also damage (parts of) the other side of * the mirror/raidz. * * Additionally, we will always have both an even and an * odd label, so that we can handle crashes in the * middle of vdev_config_sync(). */ if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) continue; /* * The two end labels are stored at the "end" of the disk, but * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). */ uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { mutex_exit(&ztest_vdev_lock); (void) close(fd); goto out; } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(B_TRUE, "can't inject bad word at 0x%"PRIx64" in %s", offset, pathrand); mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%"PRIx64"\n", pathrand, offset); } (void) close(fd); out: umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } /* * By design ztest will never inject uncorrectable damage in to the pool. * Issue a scrub, wait for it to complete, and verify there is never any * persistent damage. 
* * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). */ static int ztest_scrub_impl(spa_t *spa) { int error = spa_scan(spa, POOL_SCAN_SCRUB); if (error) return (error); while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); if (spa_get_errlog_size(spa) > 0) return (ECKSUM); ztest_pool_scrubbed = B_TRUE; return (0); } /* * Scrub the pool. */ void ztest_scrub(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; int error; /* * Scrub in progress by device removal. */ if (ztest_device_removal_active) return; /* * Start a scrub, wait a moment, then force a restart. */ (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } /* * Change the guid for the pool. */ void ztest_reguid(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; uint64_t orig, load; int error; if (ztest_opts.zo_mmp_test) return; orig = spa_guid(spa); load = spa_load_guid(spa); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_change_guid(spa); (void) pthread_rwlock_unlock(&ztest_name_lock); if (error != 0) return; if (ztest_opts.zo_verbose >= 4) { (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", orig, spa_guid(spa)); } VERIFY3U(orig, !=, spa_guid(spa)); VERIFY3U(load, ==, spa_load_guid(spa)); } void ztest_blake3(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; hrtime_t end = gethrtime() + NANOSEC; zio_cksum_salt_t salt; void *salt_ptr = &salt.zcs_bytes; struct abd *abd_data, *abd_meta; void *buf, *templ; int i, *ptr; uint32_t size; BLAKE3_CTX ctx; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); abd_data = abd_alloc(size, B_FALSE); abd_meta = abd_alloc(size, B_TRUE); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); memset(salt_ptr, 'A', 32); abd_copy_from_buf_off(abd_data, buf, 0, size); abd_copy_from_buf_off(abd_meta, buf, 0, size); while (gethrtime() <= end) { int run_count = 100; zio_cksum_t zc_ref1, zc_ref2; zio_cksum_t zc_res1, zc_res2; void *ref1 = &zc_ref1; void *ref2 = &zc_ref2; void *res1 = &zc_res1; void *res2 = &zc_res2; /* BLAKE3_KEY_LEN = 32 */ VERIFY0(blake3_impl_setname("generic")); templ = abd_checksum_blake3_tmpl_init(&salt); Blake3_InitKeyed(&ctx, salt_ptr); Blake3_Update(&ctx, buf, size); Blake3_Final(&ctx, ref1); zc_ref2 = zc_ref1; ZIO_CHECKSUM_BSWAP(&zc_ref2); abd_checksum_blake3_tmpl_free(templ); VERIFY0(blake3_impl_setname("cycle")); while (run_count-- > 0) { /* Test current implementation */ Blake3_InitKeyed(&ctx, salt_ptr); Blake3_Update(&ctx, buf, size); Blake3_Final(&ctx, res1); zc_res2 = zc_res1; ZIO_CHECKSUM_BSWAP(&zc_res2); VERIFY0(memcmp(ref1, res1, 32)); VERIFY0(memcmp(ref2, res2, 32)); /* Test ABD - data */ templ = abd_checksum_blake3_tmpl_init(&salt); abd_checksum_blake3_native(abd_data, size, templ, &zc_res1); abd_checksum_blake3_byteswap(abd_data, size, templ, &zc_res2); VERIFY0(memcmp(ref1, res1, 32)); VERIFY0(memcmp(ref2, res2, 32)); /* Test ABD - metadata */ abd_checksum_blake3_native(abd_meta, size, templ, &zc_res1); abd_checksum_blake3_byteswap(abd_meta, size, templ, &zc_res2); abd_checksum_blake3_tmpl_free(templ); VERIFY0(memcmp(ref1, res1, 32)); VERIFY0(memcmp(ref2, res2, 32)); } } abd_free(abd_data); abd_free(abd_meta); umem_free(buf, size); } void ztest_fletcher(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; hrtime_t end = gethrtime() + NANOSEC; 
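	/*
	 * For roughly one second, cross-check the fletcher_4 implementations
	 * selected by the "cycle" setting against the scalar reference, using
	 * flat buffers as well as data and metadata ABDs, in both native and
	 * byteswapped form.
	 */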
while (gethrtime() <= end) { int run_count = 100; void *buf; struct abd *abd_data, *abd_meta; uint32_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_byteswap; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); abd_data = abd_alloc(size, B_FALSE); abd_meta = abd_alloc(size, B_TRUE); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); abd_copy_from_buf_off(abd_data, buf, 0, size); abd_copy_from_buf_off(abd_meta, buf, 0, size); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_byteswap; fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); fletcher_4_native(buf, size, NULL, &zc); VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - data */ abd_fletcher_4_byteswap(abd_data, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_data, size, NULL, &zc); VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - metadata */ abd_fletcher_4_byteswap(abd_meta, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_meta, size, NULL, &zc); VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); } umem_free(buf, size); abd_free(abd_data); abd_free(abd_meta); } } void ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; void *buf; size_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_bswap; hrtime_t end = gethrtime() + NANOSEC; while (gethrtime() <= end) { int run_count = 100; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_bswap; size_t pos = 0; ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); while (pos < size) { size_t inc = 64 * ztest_random(size / 67); /* sometimes add few bytes to test non-simd */ if (ztest_random(100) < 10) inc += P2ALIGN(ztest_random(64), sizeof (uint32_t)); if (inc > (size - pos)) inc = size - pos; fletcher_4_incremental_native(buf + pos, inc, &zc); fletcher_4_incremental_byteswap(buf + pos, inc, &zc_bswap); pos += inc; } VERIFY3U(pos, ==, size); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); /* * verify if incremental on the whole buffer is * equivalent to non-incremental version */ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); fletcher_4_incremental_native(buf, size, &zc); fletcher_4_incremental_byteswap(buf, size, &zc_bswap); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); } umem_free(buf, size); } } static int ztest_set_global_vars(void) { for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { char *kv = ztest_opts.zo_gvars[i]; VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); VERIFY3U(strlen(kv), >, 0); int err = set_global_var(kv); if (ztest_opts.zo_verbose > 0) { (void) printf("setting global var %s ... %s\n", kv, err ? 
"failed" : "ok"); } if (err != 0) { (void) fprintf(stderr, "failed to set global var '%s'\n", kv); return (err); } } return (0); } static char ** ztest_global_vars_to_zdb_args(void) { char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); char **cur = args; for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { *cur++ = (char *)"-o"; *cur++ = ztest_opts.zo_gvars[i]; } ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); *cur = NULL; return (args); } /* The end of strings is indicated by a NULL element */ static char * join_strings(char **strings, const char *sep) { size_t totallen = 0; for (char **sp = strings; *sp != NULL; sp++) { totallen += strlen(*sp); totallen += strlen(sep); } if (totallen > 0) { ASSERT(totallen >= strlen(sep)); totallen -= strlen(sep); } size_t buflen = totallen + 1; char *o = malloc(buflen); /* trailing 0 byte */ o[0] = '\0'; for (char **sp = strings; *sp != NULL; sp++) { size_t would; would = strlcat(o, *sp, buflen); VERIFY3U(would, <, buflen); if (*(sp+1) == NULL) { break; } would = strlcat(o, sep, buflen); VERIFY3U(would, <, buflen); } ASSERT3S(strlen(o), ==, totallen); return (o); } static int ztest_check_path(char *path) { struct stat s; /* return true on success */ return (!stat(path, &s)); } static void ztest_get_zdb_bin(char *bin, int len) { char *zdb_path; /* * Try to use $ZDB and in-tree zdb path. If not successful, just * let popen to search through PATH. */ if ((zdb_path = getenv("ZDB"))) { strlcpy(bin, zdb_path, len); /* In env */ if (!ztest_check_path(bin)) { ztest_dump_core = 0; fatal(B_TRUE, "invalid ZDB '%s'", bin); } return; } VERIFY3P(realpath(getexecname(), bin), !=, NULL); if (strstr(bin, ".libs/ztest")) { strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ strcat(bin, "zdb"); if (ztest_check_path(bin)) return; } strcpy(bin, "zdb"); } static vdev_t * ztest_random_concrete_vdev_leaf(vdev_t *vd) { if (vd == NULL) return (NULL); if (vd->vdev_children == 0) return (vd); vdev_t *eligible[vd->vdev_children]; int eligible_idx = 0, i; for (i = 0; i < vd->vdev_children; i++) { vdev_t *cvd = vd->vdev_child[i]; if (cvd->vdev_top->vdev_removing) continue; if (cvd->vdev_children > 0 || (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { eligible[eligible_idx++] = cvd; } } VERIFY3S(eligible_idx, >, 0); uint64_t child_no = ztest_random(eligible_idx); return (ztest_random_concrete_vdev_leaf(eligible[child_no])); } void ztest_initialize(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. 
*/ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_initialize_thread != NULL; zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_INITIALIZE_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; case POOL_INITIALIZE_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start initialize %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_INITIALIZE_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } void ztest_trim(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. */ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_trim_thread != NULL; zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); uint64_t rate = 1 << ztest_random(30); boolean_t partial = (ztest_random(5) > 0); boolean_t secure = (ztest_random(5) > 0); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, secure, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_TRIM_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; case POOL_TRIM_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start TRIM %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_TRIM_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } /* * Verify pool integrity by running zdb. 
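 * The zdb child is launched via popen(); a non-zero exit status or death
 * by signal is treated as fatal.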
*/ static void ztest_run_zdb(const char *pool) { int status; char *bin; char *zdb; char *zbuf; const int len = MAXPATHLEN + MAXNAMELEN + 20; FILE *fp; bin = umem_alloc(len, UMEM_NOFAIL); zdb = umem_alloc(len, UMEM_NOFAIL); zbuf = umem_alloc(1024, UMEM_NOFAIL); ztest_get_zdb_bin(bin, len); char **set_gvars_args = ztest_global_vars_to_zdb_args(); char *set_gvars_args_joined = join_strings(set_gvars_args, " "); free(set_gvars_args); size_t would = snprintf(zdb, len, "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", set_gvars_args_joined, ztest_opts.zo_dir, pool); ASSERT3U(would, <, len); free(set_gvars_args_joined); if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", zdb); fp = popen(zdb, "r"); while (fgets(zbuf, 1024, fp) != NULL) if (ztest_opts.zo_verbose >= 3) (void) printf("%s", zbuf); status = pclose(fp); if (status == 0) goto out; ztest_dump_core = 0; if (WIFEXITED(status)) fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); else fatal(B_FALSE, "'%s' died with signal %d", zdb, WTERMSIG(status)); out: umem_free(bin, len); umem_free(zdb, len); umem_free(zbuf, 1024); } static void ztest_walk_pool_directory(const char *header) { spa_t *spa = NULL; if (ztest_opts.zo_verbose >= 6) (void) puts(header); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) if (ztest_opts.zo_verbose >= 6) (void) printf("\t%s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); } static void ztest_spa_import_export(char *oldname, char *newname) { nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; int error; if (ztest_opts.zo_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", oldname, newname); } /* * Clean up from previous runs. */ (void) spa_destroy(newname); /* * Get the pool's configuration and guid. */ VERIFY0(spa_open(oldname, &spa, FTAG)); /* * Kick off a scrub to tickle scrub/export races. */ if (ztest_random(2) == 0) (void) spa_scan(spa, POOL_SCAN_SCRUB); pool_guid = spa_guid(spa); spa_close(spa, FTAG); ztest_walk_pool_directory("pools before export"); /* * Export it. */ VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); /* * Try to import it. */ newconfig = spa_tryimport(config); ASSERT3P(newconfig, !=, NULL); fnvlist_free(newconfig); /* * Import it under the new name. */ error = spa_import(newname, config, NULL, 0); if (error != 0) { dump_nvlist(config, 0); fatal(B_FALSE, "couldn't import pool %s as %s: error %u", oldname, newname, error); } ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); /* * Try to import it under a different name -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); /* * Verify that the pool is no longer visible under the old name. */ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* * Verify that we can open and close the pool using the new name. 
*/ VERIFY0(spa_open(newname, &spa, FTAG)); ASSERT3U(pool_guid, ==, spa_guid(spa)); spa_close(spa, FTAG); fnvlist_free(config); } static void ztest_resume(spa_t *spa) { if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) (void) printf("resuming from suspended state\n"); spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); (void) zio_resume(spa); } static __attribute__((noreturn)) void ztest_resume_thread(void *arg) { spa_t *spa = arg; while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); /* * Periodically change the zfs_compressed_arc_enabled setting. */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); /* * Periodically change the zfs_abd_scatter_enabled setting. */ if (ztest_random(10) == 0) zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); } static __attribute__((noreturn)) void ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; spa_t *spa = ztest_spa; hrtime_t delay, overdue, last_run = gethrtime(); delay = (zs->zs_thread_stop - zs->zs_thread_start) + MSEC2NSEC(zfs_deadman_synctime_ms); while (!ztest_exiting) { /* * Wait for the delay timer while checking occasionally * if we should stop. */ if (gethrtime() < last_run + delay) { (void) poll(NULL, 0, 1000); continue; } /* * If the pool is suspended then fail immediately. Otherwise, * check to see if the pool is making any progress. If * vdev_deadman() discovers that there hasn't been any recent * I/Os then it will end up aborting the tests. */ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { fatal(B_FALSE, "aborting test after %lu seconds because " "pool has transitioned to a suspended state.", zfs_deadman_synctime_ms / 1000); } vdev_deadman(spa->spa_root_vdev, FTAG); /* * If the process doesn't complete within a grace period of * zfs_deadman_synctime_ms over the expected finish time, * then it may be hung and is terminated. */ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); if (gethrtime() > overdue) { fatal(B_FALSE, "aborting test after %llu seconds because " "the process is overdue for termination.", (gethrtime() - zs->zs_proc_start) / NANOSEC); } (void) printf("ztest has been running for %lld seconds\n", (gethrtime() - zs->zs_proc_start) / NANOSEC); last_run = gethrtime(); delay = MSEC2NSEC(zfs_deadman_checktime_ms); } thread_exit(); } static void ztest_execute(int test, ztest_info_t *zi, uint64_t id) { ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); hrtime_t functime = gethrtime(); int i; for (i = 0; i < zi->zi_iters; i++) zi->zi_func(zd, id); functime = gethrtime() - functime; atomic_add_64(&zc->zc_count, 1); atomic_add_64(&zc->zc_time, functime); if (ztest_opts.zo_verbose >= 4) (void) printf("%6.2f sec in %s\n", (double)functime / NANOSEC, zi->zi_funcname); } static __attribute__((noreturn)) void ztest_thread(void *arg) { int rand; uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; uint64_t call_next; hrtime_t now; ztest_info_t *zi; ztest_shared_callstate_t *zc; while ((now = gethrtime()) < zs->zs_thread_stop) { /* * See if it's time to force a crash. */ if (now > zs->zs_thread_kill) ztest_kill(zs); /* * If we're getting ENOSPC with some regularity, stop. */ if (zs->zs_enospc_count > 10) break; /* * Pick a random function to execute. 
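		 * Each function's next eligible call time lives in shared
		 * state (zc_next); the atomic_cas_64() below lets exactly one
		 * thread claim that slot and push the next call out by a
		 * random amount derived from the function's configured
		 * interval.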
*/ rand = ztest_random(ZTEST_FUNCS); zi = &ztest_info[rand]; zc = ZTEST_GET_SHARED_CALLSTATE(rand); call_next = zc->zc_next; if (now >= call_next && atomic_cas_64(&zc->zc_next, call_next, call_next + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { ztest_execute(rand, zi, id); } } thread_exit(); } static void ztest_dataset_name(char *dsname, const char *pool, int d) { (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); } static void ztest_dataset_destroy(int d) { char name[ZFS_MAX_DATASET_NAME_LEN]; int t; ztest_dataset_name(name, ztest_opts.zo_pool, d); if (ztest_opts.zo_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); /* * Cleanup any non-standard clones and snapshots. In general, * ztest thread t operates on dataset (t % zopt_datasets), * so there may be more than one thing to clean up. */ for (t = d; t < ztest_opts.zo_threads; t += ztest_opts.zo_datasets) ztest_dsl_dataset_cleanup(name, t); (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } static void ztest_dataset_dirobj_verify(ztest_ds_t *zd) { uint64_t usedobjs, dirobjs, scratch; /* * ZTEST_DIROBJ is the object directory for the entire dataset. * Therefore, the number of objects in use should equal the * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. * If not, we have an object leak. * * Note that we can only check this in ztest_dataset_open(), * when the open-context and syncing-context values agree. * That's because zap_count() returns the open-context value, * while dmu_objset_space() returns the rootbp fill count. */ VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); ASSERT3U(dirobjs + 1, ==, usedobjs); } static int ztest_dataset_open(int d) { ztest_ds_t *zd = &ztest_ds[d]; uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; objset_t *os; zilog_t *zilog; char name[ZFS_MAX_DATASET_NAME_LEN]; int error; ztest_dataset_name(name, ztest_opts.zo_pool, d); (void) pthread_rwlock_rdlock(&ztest_name_lock); error = ztest_dataset_create(name); if (error == ENOSPC) { (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_record_enospc(FTAG); return (error); } ASSERT(error == 0 || error == EEXIST); VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, zd, &os)); (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); zilog = zd->zd_zilog; if (zilog->zl_header->zh_claim_lr_seq != 0 && zilog->zl_header->zh_claim_lr_seq < committed_seq) fatal(B_FALSE, "missing log records: " "claimed %"PRIu64" < committed %"PRIu64"", zilog->zl_header->zh_claim_lr_seq, committed_seq); ztest_dataset_dirobj_verify(zd); zil_replay(os, zd, ztest_replay_vector); ztest_dataset_dirobj_verify(zd); if (ztest_opts.zo_verbose >= 6) (void) printf("%s replay %"PRIu64" blocks, " "%"PRIu64" records, seq %"PRIu64"\n", zd->zd_name, zilog->zl_parse_blk_count, zilog->zl_parse_lr_count, zilog->zl_replaying_seq); zilog = zil_open(os, ztest_get_data, NULL); if (zilog->zl_replaying_seq != 0 && zilog->zl_replaying_seq < committed_seq) fatal(B_FALSE, "missing log records: " "replayed %"PRIu64" < committed %"PRIu64"", zilog->zl_replaying_seq, committed_seq); return (0); } static void ztest_dataset_close(int d) { ztest_ds_t *zd = &ztest_ds[d]; zil_close(zd->zd_zilog); dmu_objset_disown(zd->zd_os, B_TRUE, zd); ztest_zd_fini(zd); } static int ztest_replay_zil_cb(const char *name, void *arg) { (void) arg; objset_t *os; ztest_ds_t *zdtmp; 
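	/*
	 * Own the dataset, replay its ZIL into a scratch ztest_ds_t, report
	 * the replay statistics when running verbosely, then disown it.
	 */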
VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && ztest_opts.zo_verbose >= 6) { zilog_t *zilog = dmu_objset_zil(os); (void) printf("%s replay %"PRIu64" blocks, " "%"PRIu64" records, seq %"PRIu64"\n", name, zilog->zl_parse_blk_count, zilog->zl_parse_lr_count, zilog->zl_replaying_seq); } umem_free(zdtmp, sizeof (ztest_ds_t)); dmu_objset_disown(os, B_TRUE, FTAG); return (0); } static void ztest_freeze(void) { ztest_ds_t *zd = &ztest_ds[0]; spa_t *spa; int numloops = 0; if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; /* * Force the first log block to be transactionally allocated. * We have to do this before we freeze the pool -- otherwise * the log chain won't be anchored. */ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { ztest_dmu_object_alloc_free(zd, 0); zil_commit(zd->zd_zilog, 0); } txg_wait_synced(spa_get_dsl(spa), 0); /* * Freeze the pool. This stops spa_sync() from doing anything, * so that the only way to record changes from now on is the ZIL. */ spa_freeze(spa); /* * Because it is hard to predict how much space a write will actually * require beforehand, we leave ourselves some fudge space to write over * capacity. */ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. * * Run a random number of times less than zo_maxloops and ensure we do * not run out of space on the pool. */ while (ztest_random(10) != 0 && numloops++ < ztest_opts.zo_maxloops && metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { ztest_od_t od; ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); ztest_io(zd, od.od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } /* * Commit all of the changes we just generated. */ zil_commit(zd->zd_zilog, 0); txg_wait_synced(spa_get_dsl(spa), 0); /* * Close our dataset and close the pool. */ ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); /* * Open and close the pool and dataset to induce log replay. */ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; txg_wait_synced(spa_get_dsl(spa), 0); ztest_dataset_close(0); ztest_reguid(NULL, 0); spa_close(spa, FTAG); kernel_fini(); } static void ztest_import_impl(void) { importargs_t args = { 0 }; nvlist_t *cfg = NULL; int nsearch = 1; char *searchdirs[nsearch]; int flags = ZFS_IMPORT_MISSING_LOG; searchdirs[0] = ztest_opts.zo_dir; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_FALSE; VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, &libzpool_config_ops)); VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); fnvlist_free(cfg); } /* * Import a storage pool with the given name. 
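 * The on-disk configuration is located by scanning zo_dir, and the pool is
 * imported with ZFS_IMPORT_MISSING_LOG.  Unless this is an MMP test run,
 * the pool is then verified with zdb and exercised via spa_freeze().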
*/ static void ztest_import(ztest_shared_t *zs) { spa_t *spa; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } /* * Kick off threads to run tests on all datasets in parallel. */ static void ztest_run(ztest_shared_t *zs) { spa_t *spa; objset_t *os; kthread_t *resume_thread, *deadman_thread; kthread_t **run_threads; uint64_t object; int error; int t, d; ztest_exiting = B_FALSE; /* * Initialize parent/child shared state. */ mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); zs->zs_thread_start = gethrtime(); zs->zs_thread_stop = zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); zs->zs_thread_kill = zs->zs_thread_stop; if (ztest_random(100) < ztest_opts.zo_killrate) { zs->zs_thread_kill -= ztest_random(ztest_opts.zo_passtime * NANOSEC); } mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), offsetof(ztest_cb_data_t, zcd_node)); /* * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. */ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { VERIFY3S(error, ==, ENOENT); ztest_import_impl(); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; } metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; VERIFY0(vdev_raidz_impl_set("cycle")); dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); zs->zs_guid = dds.dds_guid; dmu_objset_disown(os, B_TRUE, FTAG); /* * Create a thread to periodically resume suspended I/O. */ resume_thread = thread_create(NULL, 0, ztest_resume_thread, spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); /* * Create a deadman thread and set to panic if we hang. */ deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; /* * Verify that we can safely inquire about any object, * whether it's allocated or not. To make it interesting, * we probe a 5-wide window around each power of two. * This hits all edge cases, including zero and the max. */ for (t = 0; t < 64; t++) { for (d = -5; d <= 5; d++) { error = dmu_object_info(spa->spa_meta_objset, (1ULL << t) + d, NULL); ASSERT(error == 0 || error == ENOENT || error == EINVAL); } } /* * If we got any ENOSPC errors on the previous run, destroy something. 
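 * A randomly chosen dataset is destroyed to free space, and the ENOSPC
 * counter is reset for this pass.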
*/ if (zs->zs_enospc_count != 0) { int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); } zs->zs_enospc_count = 0; /* * If we were in the middle of ztest_device_removal() and were killed * we need to ensure the removal and scrub complete before running * any tests that check ztest_device_removal_active. The removal will * be restarted automatically when the spa is opened, but we need to * initiate the scrub manually if it is not already in progress. Note * that we always run the scrub whenever an indirect vdev exists * because we have no way of knowing for sure if ztest_device_removal() * fully completed its scrub before the pool was reimported. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), UMEM_NOFAIL); if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); /* * Replay all logs of all datasets in the pool. This is primarily for * temporary datasets which wouldn't otherwise get replayed, which * can trigger failures when attempting to offline a SLOG in * ztest_fault_inject(). */ (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, NULL, DS_FIND_CHILDREN); /* * Kick off all the tests that run in parallel. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); return; } run_threads[t] = thread_create(NULL, 0, ztest_thread, (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } /* * Wait for all of the tests to complete. */ for (t = 0; t < ztest_opts.zo_threads; t++) VERIFY0(thread_join(run_threads[t])); /* * Close all datasets. This must be done after all the threads * are joined so we can be sure none of the datasets are in-use * by any of the threads. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets) ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; VERIFY0(thread_join(resume_thread)); VERIFY0(thread_join(deadman_thread)); ztest_resume(spa); /* * Right before closing the pool, kick off a bunch of async I/O; * spa_close() should wait for it to complete. */ for (object = 1; object < 50; object++) { dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ); } /* Verify that at least one commit cb was called in a timely fashion */ if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) VERIFY0(zc_min_txg_delay); spa_close(spa, FTAG); /* * Verify that we can loop over all pools. */ mutex_enter(&spa_namespace_lock); for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) if (ztest_opts.zo_verbose > 3) (void) printf("spa_next: found %s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); /* * Verify that we can export the pool and reimport it under a * different name. 
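 * Half the time (and never during MMP testing) the pool is exported,
 * imported as <pool>_import, and then moved back to its original name by a
 * second export/import cycle.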
*/ if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { char name[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s_import", ztest_opts.zo_pool); ztest_spa_import_export(ztest_opts.zo_pool, name); ztest_spa_import_export(name, ztest_opts.zo_pool); } kernel_fini(); list_destroy(&zcl.zcl_callbacks); mutex_destroy(&zcl.zcl_callbacks_lock); (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void print_time(hrtime_t t, char *timebuf) { hrtime_t s = t / NANOSEC; hrtime_t m = s / 60; hrtime_t h = m / 60; hrtime_t d = h / 24; s -= m * 60; m -= h * 60; h -= d * 24; timebuf[0] = '\0'; if (d) (void) sprintf(timebuf, "%llud%02lluh%02llum%02llus", d, h, m, s); else if (h) (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); else if (m) (void) sprintf(timebuf, "%llum%02llus", m, s); else (void) sprintf(timebuf, "%llus", s); } static nvlist_t * make_random_props(void) { nvlist_t *props; props = fnvlist_alloc(); if (ztest_random(2) == 0) return (props); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); return (props); } /* * Create a storage pool with the given name and initial vdev size. * Then test spa_freeze() functionality. */ static void ztest_init(ztest_shared_t *zs) { spa_t *spa; nvlist_t *nvroot, *props; int i; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* * Create the storage pool. */ (void) spa_destroy(ztest_opts.zo_pool); ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_props(); /* * We don't expect the pool to suspend unless maxfaults == 0, * in which case ztest_fault_inject() temporarily takes away * the only valid replica. */ fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); for (i = 0; i < SPA_FEATURES; i++) { char *buf; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; /* * 75% chance of using the log space map feature. We want ztest * to exercise both the code paths that use the log space map * feature and the ones that don't. 
*/ if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) continue; VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); fnvlist_add_uint64(props, buf, 0); free(buf); } VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void setup_data_fd(void) { static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; ztest_fd_data = mkstemp(ztest_name_data); ASSERT3S(ztest_fd_data, >=, 0); (void) unlink(ztest_name_data); } static int shared_data_size(ztest_shared_hdr_t *hdr) { int size; size = hdr->zh_hdr_size; size += hdr->zh_opts_size; size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; return (size); } static void setup_hdr(void) { int size; ztest_shared_hdr_t *hdr; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); hdr->zh_size = sizeof (ztest_shared_t); hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } static void setup_data(void) { int size, offset; ztest_shared_hdr_t *hdr; uint8_t *buf; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); buf = (uint8_t *)hdr; offset = hdr->zh_hdr_size; ztest_shared_opts = (void *)&buf[offset]; offset += hdr->zh_opts_size; ztest_shared = (void *)&buf[offset]; offset += hdr->zh_size; ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; } static boolean_t exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) { pid_t pid; int status; char *cmdbuf = NULL; pid = fork(); if (cmd == NULL) { cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); cmd = cmdbuf; } if (pid == -1) fatal(B_TRUE, "fork failed"); if (pid == 0) { /* child */ char fd_data_str[12]; VERIFY3S(11, >=, snprintf(fd_data_str, 12, "%d", ztest_fd_data)); VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); if (libpath != NULL) { const char *curlp = getenv("LD_LIBRARY_PATH"); if (curlp == NULL) VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); else { char *newlp = NULL; VERIFY3S(-1, !=, asprintf(&newlp, "%s:%s", libpath, curlp)); VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); free(newlp); } } (void) execl(cmd, cmd, (char *)NULL); ztest_dump_core = 
B_FALSE; fatal(B_TRUE, "exec failed: %s", cmd); } if (cmdbuf != NULL) { umem_free(cmdbuf, MAXPATHLEN); cmd = NULL; } while (waitpid(pid, &status, 0) != pid) continue; if (statusp != NULL) *statusp = status; if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 0) { (void) fprintf(stderr, "child exited with code %d\n", WEXITSTATUS(status)); exit(2); } return (B_FALSE); } else if (WIFSIGNALED(status)) { if (!ignorekill || WTERMSIG(status) != SIGKILL) { (void) fprintf(stderr, "child died with signal %d\n", WTERMSIG(status)); exit(3); } return (B_TRUE); } else { (void) fprintf(stderr, "something strange happened to child\n"); exit(4); } } static void ztest_run_init(void) { int i; ztest_shared_t *zs = ztest_shared; /* * Blow away any existing copy of zpool.cache */ (void) remove(spa_config_path); if (ztest_opts.zo_init == 0) { if (ztest_opts.zo_verbose >= 1) (void) printf("Importing pool %s\n", ztest_opts.zo_pool); ztest_import(zs); return; } /* * Create and initialize our storage pool. */ for (i = 1; i <= ztest_opts.zo_init; i++) { memset(zs, 0, sizeof (*zs)); if (ztest_opts.zo_verbose >= 3 && ztest_opts.zo_init != 1) { (void) printf("ztest_init(), pass %d\n", i); } ztest_init(zs); } } int main(int argc, char **argv) { int kills = 0; int iters = 0; int older = 0; int newer = 0; ztest_shared_t *zs; ztest_info_t *zi; ztest_shared_callstate_t *zc; char timebuf[100]; char numbuf[NN_NUMBUF_SZ]; char *cmd; boolean_t hasalt; int f, err; char *fd_data_str = getenv("ZTEST_FD_DATA"); struct sigaction action; (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); zfs_deadman_synctime_ms = 300000; zfs_deadman_checktime_ms = 30000; /* * As two-word space map entries may not come up often (especially * if pool and vdev sizes are small) we want to force at least some * of them so the feature get tested. */ zfs_force_some_double_word_sm_entries = B_TRUE; /* * Verify that even extensively damaged split blocks with many * segments can be reconstructed in a reasonable amount of time * when reconstruction is known to be possible. * * Note: the lower this value is, the more damage we inflict, and * the more time ztest spends in recovering that damage. We chose * to induce damage 1/100th of the time so recovery is tested but * not so frequently that ztest doesn't get to test other code paths. */ zfs_reconstruct_indirect_damage_fraction = 100; action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } if (sigaction(SIGABRT, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } /* * Force random_get_bytes() to use /dev/urandom in order to prevent * ztest from needlessly depleting the system entropy pool. 
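 * The descriptor opened below is kept for the remainder of the run and is
 * what subsequent ztest_random() calls draw from.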
*/ random_path = "/dev/urandom"; ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); ASSERT3S(ztest_fd_rand, >=, 0); if (!fd_data_str) { process_options(argc, argv); setup_data_fd(); setup_hdr(); setup_data(); memcpy(ztest_shared_opts, &ztest_opts, sizeof (*ztest_shared_opts)); } else { ztest_fd_data = atoi(fd_data_str); setup_data(); memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); err = ztest_set_global_vars(); if (err != 0 && !fd_data_str) { /* error message done by ztest_set_global_vars */ exit(EXIT_FAILURE); } else { /* children should not be spawned if setting gvars fails */ VERIFY3S(err, ==, 0); } /* Override location of zpool.cache */ VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); zs = ztest_shared; if (fd_data_str) { metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; if (zs->zs_do_init) ztest_run_init(); else ztest_run(zs); exit(0); } hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { (void) printf("%"PRIu64" vdevs, %d datasets, %d threads," "%d %s disks, %"PRIu64" seconds...\n\n", ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, ztest_opts.zo_raid_children, ztest_opts.zo_raid_type, ztest_opts.zo_time); } cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); (void) strlcpy(cmd, getexecname(), MAXNAMELEN); zs->zs_do_init = B_TRUE; if (strlen(ztest_opts.zo_alt_ztest) != 0) { if (ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest for " "initialization: %s\n", ztest_opts.zo_alt_ztest); } VERIFY(!exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_FALSE, NULL)); } else { VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); } zs->zs_do_init = B_FALSE; zs->zs_proc_start = gethrtime(); zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) zc->zc_next = UINT64_MAX; else zc->zc_next = zs->zs_proc_start + ztest_random(2 * zi->zi_interval[0] + 1); } /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes * to verify that we never lose on-disk consistency. */ while (gethrtime() < zs->zs_proc_stop) { int status; boolean_t killed; /* * Initialize the workload counters for each function. 
*/ for (f = 0; f < ZTEST_FUNCS; f++) { zc = ZTEST_GET_SHARED_CALLSTATE(f); zc->zc_count = 0; zc->zc_time = 0; } /* Set the allocation switch size */ zs->zs_metaslab_df_alloc_threshold = ztest_random(zs->zs_metaslab_sz / 4) + 1; if (!hasalt || ztest_random(2) == 0) { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing newer ztest: %s\n", cmd); } newer++; killed = exec_child(cmd, NULL, B_TRUE, &status); } else { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest: %s\n", ztest_opts.zo_alt_ztest); } older++; killed = exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_TRUE, &status); } if (killed) kills++; iters++; if (ztest_opts.zo_verbose >= 1) { hrtime_t now = gethrtime(); now = MIN(now, zs->zs_proc_stop); print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf, sizeof (numbuf)); (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", iters, WIFEXITED(status) ? "Complete" : "SIGKILL", zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / (ztest_opts.zo_time * NANOSEC), timebuf); } if (ztest_opts.zo_verbose >= 2) { (void) printf("\nWorkload summary:\n\n"); (void) printf("%7s %9s %s\n", "Calls", "Time", "Function"); (void) printf("%7s %9s %s\n", "-----", "----", "--------"); for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); print_time(zc->zc_time, timebuf); (void) printf("%7"PRIu64" %9s %s\n", zc->zc_count, timebuf, zi->zi_funcname); } (void) printf("\n"); } if (!ztest_opts.zo_mmp_test) ztest_run_zdb(ztest_opts.zo_pool); } if (ztest_opts.zo_verbose >= 1) { if (hasalt) { (void) printf("%d runs of older ztest: %s\n", older, ztest_opts.zo_alt_ztest); (void) printf("%d runs of newer ztest: %s\n", newer, cmd); } (void) printf("%d killed, %d completed, %.0f%% kill rate\n", kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } umem_free(cmd, MAXNAMELEN); return (0); } diff --git a/config/Rules.am b/config/Rules.am index 7162b771869d..abb4ced33233 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -1,83 +1,85 @@ # # Default build rules for all user space components, every Makefile.am # should include these rules and override or extend them as needed. 
# PHONY = AM_CPPFLAGS = \ -include $(top_builddir)/zfs_config.h \ -I$(top_builddir)/include \ -I$(top_srcdir)/include \ -I$(top_srcdir)/module/icp/include \ -I$(top_srcdir)/lib/libspl/include \ -I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ AM_LIBTOOLFLAGS = --silent AM_CFLAGS = -std=gnu99 -Wall -Wextra -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Wno-sign-compare -Wno-missing-field-initializers AM_CFLAGS += -fno-strict-aliasing AM_CFLAGS += $(NO_OMIT_FRAME_POINTER) AM_CFLAGS += $(IMPLICIT_FALLTHROUGH) AM_CFLAGS += $(DEBUG_CFLAGS) AM_CFLAGS += $(ASAN_CFLAGS) AM_CFLAGS += $(UBSAN_CFLAGS) AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) $(NO_FORMAT_ZERO_LENGTH) if BUILD_FREEBSD AM_CFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion AM_CFLAGS += -include $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h AM_CFLAGS += -I/usr/include -I/usr/local/include endif AM_CPPFLAGS += -D_GNU_SOURCE AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" AM_CPPFLAGS += -DPKGDATADIR=\"$(pkgdatadir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-@ac_system_l@-user\" AM_CPPFLAGS_NOCHECK = -D"strtok(...)=strtok(__VA_ARGS__) __attribute__((deprecated(\"Use strtok_r(3) instead!\")))" AM_CPPFLAGS_NOCHECK += -D"__xpg_basename(...)=__xpg_basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" AM_CPPFLAGS_NOCHECK += -D"basename(...)=basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" AM_CPPFLAGS_NOCHECK += -D"dirname(...)=dirname(__VA_ARGS__) __attribute__((deprecated(\"dirname(3) is underspecified. Use zfs_dirnamelen() instead!\")))" AM_CPPFLAGS_NOCHECK += -D"bcopy(...)=__attribute__((deprecated(\"bcopy(3) is deprecated. Use memcpy(3)/memmove(3) instead!\"))) bcopy(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"bcmp(...)=__attribute__((deprecated(\"bcmp(3) is deprecated. Use memcmp(3) instead!\"))) bcmp(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"bzero(...)=__attribute__((deprecated(\"bzero(3) is deprecated. Use memset(3) instead!\"))) bzero(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"asctime(...)=__attribute__((deprecated(\"Use strftime(3) instead!\"))) asctime(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"asctime_r(...)=__attribute__((deprecated(\"Use strftime(3) instead!\"))) asctime_r(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"gmtime(...)=__attribute__((deprecated(\"gmtime(3) isn't thread-safe. Use gmtime_r(3) instead!\"))) gmtime(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"localtime(...)=__attribute__((deprecated(\"localtime(3) isn't thread-safe. Use localtime_r(3) instead!\"))) localtime(__VA_ARGS__)" +AM_CPPFLAGS_NOCHECK += -D"strncpy(...)=__attribute__((deprecated(\"strncpy(3) is deprecated. 
Use strlcpy(3) instead!\"))) strncpy(__VA_ARGS__)" + AM_CPPFLAGS += $(AM_CPPFLAGS_NOCHECK) if ASAN_ENABLED AM_CPPFLAGS += -DZFS_ASAN_ENABLED endif if UBSAN_ENABLED AM_CPPFLAGS += -DZFS_UBSAN_ENABLED endif AM_LDFLAGS = $(DEBUG_LDFLAGS) AM_LDFLAGS += $(ASAN_LDFLAGS) AM_LDFLAGS += $(UBSAN_LDFLAGS) if BUILD_FREEBSD AM_LDFLAGS += -fstack-protector-strong -shared AM_LDFLAGS += -Wl,-x -Wl,--fatal-warnings -Wl,--warn-shared-textrel AM_LDFLAGS += -lm endif # If a target includes kernel code, generate warnings for large stack frames KERNEL_CFLAGS = $(FRAME_LARGER_THAN) # See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 LIBRARY_CFLAGS = -no-suppress # Forcibly enable asserts/debugging for libzpool &al. FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG diff --git a/include/os/freebsd/spl/sys/kstat.h b/include/os/freebsd/spl/sys/kstat.h index 947dfee62393..7dc2c4753b02 100644 --- a/include/os/freebsd/spl/sys/kstat.h +++ b/include/os/freebsd/spl/sys/kstat.h @@ -1,230 +1,230 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . 
*/ #ifndef _SPL_KSTAT_H #define _SPL_KSTAT_H #include #ifndef _STANDALONE #include #endif struct list_head {}; #include #include #define KSTAT_STRLEN 255 #define KSTAT_RAW_MAX (128*1024) /* * For reference valid classes are: * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc */ #define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ #define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ #define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ #define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ #define KSTAT_NUM_TYPES 5 #define KSTAT_DATA_CHAR 0 #define KSTAT_DATA_INT32 1 #define KSTAT_DATA_UINT32 2 #define KSTAT_DATA_INT64 3 #define KSTAT_DATA_UINT64 4 #define KSTAT_DATA_LONG 5 #define KSTAT_DATA_ULONG 6 #define KSTAT_DATA_STRING 7 #define KSTAT_NUM_DATAS 8 #define KSTAT_INTR_HARD 0 #define KSTAT_INTR_SOFT 1 #define KSTAT_INTR_WATCHDOG 2 #define KSTAT_INTR_SPURIOUS 3 #define KSTAT_INTR_MULTSVC 4 #define KSTAT_NUM_INTRS 5 #define KSTAT_FLAG_VIRTUAL 0x01 #define KSTAT_FLAG_VAR_SIZE 0x02 #define KSTAT_FLAG_WRITABLE 0x04 #define KSTAT_FLAG_PERSISTENT 0x08 #define KSTAT_FLAG_DORMANT 0x10 #define KSTAT_FLAG_INVALID 0x20 #define KSTAT_FLAG_LONGSTRINGS 0x40 #define KSTAT_FLAG_NO_HEADERS 0x80 #define KS_MAGIC 0x9d9d9d9d /* Dynamic updates */ #define KSTAT_READ 0 #define KSTAT_WRITE 1 struct kstat_s; typedef struct kstat_s kstat_t; typedef int kid_t; /* unique kstat id */ typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ struct seq_file { char *sf_buf; size_t sf_size; }; void seq_printf(struct seq_file *m, const char *fmt, ...); typedef struct kstat_module { - char ksm_name[KSTAT_STRLEN+1]; /* module name */ + char ksm_name[KSTAT_STRLEN]; /* module name */ struct list_head ksm_module_list; /* module linkage */ struct list_head ksm_kstat_list; /* list of kstat entries */ struct proc_dir_entry *ksm_proc; /* proc entry */ } kstat_module_t; typedef struct kstat_raw_ops { int (*headers)(char *buf, size_t size); int (*seq_headers)(struct seq_file *); int (*data)(char *buf, size_t size, void *data); void *(*addr)(kstat_t *ksp, loff_t index); } kstat_raw_ops_t; struct kstat_s { int ks_magic; /* magic value */ kid_t ks_kid; /* unique kstat ID */ hrtime_t ks_crtime; /* creation time */ hrtime_t ks_snaptime; /* last access time */ - char ks_module[KSTAT_STRLEN+1]; /* provider module name */ + char ks_module[KSTAT_STRLEN]; /* provider module name */ int ks_instance; /* provider module instance */ - char ks_name[KSTAT_STRLEN+1]; /* kstat name */ - char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + char ks_name[KSTAT_STRLEN]; /* kstat name */ + char ks_class[KSTAT_STRLEN]; /* kstat class */ uchar_t ks_type; /* kstat data type */ uchar_t ks_flags; /* kstat flags */ void *ks_data; /* kstat type-specific data */ uint_t ks_ndata; /* # of data records */ size_t ks_data_size; /* size of kstat data section */ kstat_update_t *ks_update; /* dynamic updates */ void *ks_private; /* private data */ void *ks_private1; /* private data */ kmutex_t ks_private_lock; /* kstat private data lock */ kmutex_t *ks_lock; /* kstat data lock */ struct list_head ks_list; /* kstat linkage */ kstat_module_t *ks_owner; /* kstat module linkage */ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ char *ks_raw_buf; /* buf used for raw ops */ size_t ks_raw_bufsize; /* size of raw ops buffer */ #ifndef _STANDALONE struct sysctl_ctx_list ks_sysctl_ctx; struct sysctl_oid *ks_sysctl_root; #endif /* _STANDALONE */ }; 
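/*
 * The name buffers in the structures above and below are KSTAT_STRLEN bytes,
 * and that length now includes the terminating NUL; they are expected to be
 * filled with a bounded copy such as strlcpy(ksp->ks_name, name, KSTAT_STRLEN),
 * so over-long names are truncated rather than overflowing.
 */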
typedef struct kstat_named_s { char name[KSTAT_STRLEN]; /* name of counter */ uchar_t data_type; /* data type */ union { char c[16]; /* 128-bit int */ int32_t i32; /* 32-bit signed int */ uint32_t ui32; /* 32-bit unsigned int */ int64_t i64; /* 64-bit signed int */ uint64_t ui64; /* 64-bit unsigned int */ long l; /* native signed long */ ulong_t ul; /* native unsigned long */ struct { union { char *ptr; /* NULL-term string */ char __pad[8]; /* 64-bit padding */ } addr; uint32_t len; /* # bytes for strlen + '\0' */ } string; } value; } kstat_named_t; #define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) #define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) typedef struct kstat_intr { uint_t intrs[KSTAT_NUM_INTRS]; } kstat_intr_t; typedef struct kstat_io { u_longlong_t nread; /* number of bytes read */ u_longlong_t nwritten; /* number of bytes written */ uint_t reads; /* number of read operations */ uint_t writes; /* number of write operations */ hrtime_t wtime; /* cumulative wait (pre-service) time */ hrtime_t wlentime; /* cumulative wait len*time product */ hrtime_t wlastupdate; /* last time wait queue changed */ hrtime_t rtime; /* cumulative run (service) time */ hrtime_t rlentime; /* cumulative run length*time product */ hrtime_t rlastupdate; /* last time run queue changed */ uint_t wcnt; /* count of elements in wait state */ uint_t rcnt; /* count of elements in run state */ } kstat_io_t; typedef struct kstat_timer { - char name[KSTAT_STRLEN+1]; /* event name */ + char name[KSTAT_STRLEN]; /* event name */ u_longlong_t num_events; /* number of events */ hrtime_t elapsed_time; /* cumulative elapsed time */ hrtime_t min_time; /* shortest event duration */ hrtime_t max_time; /* longest event duration */ hrtime_t start_time; /* previous event start time */ hrtime_t stop_time; /* previous event stop time */ } kstat_timer_t; int spl_kstat_init(void); void spl_kstat_fini(void); extern void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void* (*addr)(kstat_t *ksp, loff_t index)); extern void __kstat_set_seq_raw_ops(kstat_t *ksp, int (*headers)(struct seq_file *), int (*data)(char *buf, size_t size, void *data), void* (*addr)(kstat_t *ksp, loff_t index)); extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags); extern void __kstat_install(kstat_t *ksp); extern void __kstat_delete(kstat_t *ksp); #define kstat_set_seq_raw_ops(k, h, d, a) \ __kstat_set_seq_raw_ops(k, h, d, a) #define kstat_set_raw_ops(k, h, d, a) \ __kstat_set_raw_ops(k, h, d, a) #ifndef _STANDALONE #define kstat_create(m, i, n, c, t, s, f) \ __kstat_create(m, i, n, c, t, s, f) #define kstat_install(k) __kstat_install(k) #define kstat_delete(k) __kstat_delete(k) #else #define kstat_create(m, i, n, c, t, s, f) ((kstat_t *)0) #define kstat_install(k) #define kstat_delete(k) #endif #endif /* _SPL_KSTAT_H */ diff --git a/include/os/linux/spl/sys/kstat.h b/include/os/linux/spl/sys/kstat.h index 928f70757545..305c411ddfa0 100644 --- a/include/os/linux/spl/sys/kstat.h +++ b/include/os/linux/spl/sys/kstat.h @@ -1,218 +1,218 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . 
* UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_KSTAT_H #define _SPL_KSTAT_H #include #include #include #include #include #include #define KSTAT_STRLEN 255 #define KSTAT_RAW_MAX (128*1024) /* * For reference valid classes are: * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc */ #define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ #define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ #define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ #define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ #define KSTAT_NUM_TYPES 5 #define KSTAT_DATA_CHAR 0 #define KSTAT_DATA_INT32 1 #define KSTAT_DATA_UINT32 2 #define KSTAT_DATA_INT64 3 #define KSTAT_DATA_UINT64 4 #define KSTAT_DATA_LONG 5 #define KSTAT_DATA_ULONG 6 #define KSTAT_DATA_STRING 7 #define KSTAT_NUM_DATAS 8 #define KSTAT_INTR_HARD 0 #define KSTAT_INTR_SOFT 1 #define KSTAT_INTR_WATCHDOG 2 #define KSTAT_INTR_SPURIOUS 3 #define KSTAT_INTR_MULTSVC 4 #define KSTAT_NUM_INTRS 5 #define KSTAT_FLAG_VIRTUAL 0x01 #define KSTAT_FLAG_VAR_SIZE 0x02 #define KSTAT_FLAG_WRITABLE 0x04 #define KSTAT_FLAG_PERSISTENT 0x08 #define KSTAT_FLAG_DORMANT 0x10 #define KSTAT_FLAG_INVALID 0x20 #define KSTAT_FLAG_LONGSTRINGS 0x40 #define KSTAT_FLAG_NO_HEADERS 0x80 #define KS_MAGIC 0x9d9d9d9d /* Dynamic updates */ #define KSTAT_READ 0 #define KSTAT_WRITE 1 struct kstat_s; typedef struct kstat_s kstat_t; typedef int kid_t; /* unique kstat id */ typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ typedef struct kstat_module { - char ksm_name[KSTAT_STRLEN+1]; /* module name */ + char ksm_name[KSTAT_STRLEN]; /* module name */ struct list_head ksm_module_list; /* module linkage */ struct list_head ksm_kstat_list; /* list of kstat entries */ struct proc_dir_entry *ksm_proc; /* proc entry */ } kstat_module_t; typedef struct kstat_raw_ops { int (*headers)(char *buf, size_t size); int (*data)(char *buf, size_t size, void *data); void *(*addr)(kstat_t *ksp, loff_t index); } kstat_raw_ops_t; typedef struct kstat_proc_entry { - char kpe_name[KSTAT_STRLEN+1]; /* kstat name */ - char kpe_module[KSTAT_STRLEN+1]; /* provider module name */ + char kpe_name[KSTAT_STRLEN]; /* kstat name */ + char kpe_module[KSTAT_STRLEN]; /* provider module name */ kstat_module_t *kpe_owner; /* kstat module linkage */ struct list_head kpe_list; /* kstat linkage */ struct proc_dir_entry *kpe_proc; /* procfs entry */ } kstat_proc_entry_t; struct kstat_s { int ks_magic; /* magic value */ kid_t ks_kid; /* unique kstat ID */ hrtime_t ks_crtime; /* creation time */ hrtime_t ks_snaptime; /* last access time */ int ks_instance; /* provider module instance */ - char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + char ks_class[KSTAT_STRLEN]; /* kstat class */ uchar_t ks_type; /* kstat data type */ uchar_t ks_flags; /* kstat flags */ void *ks_data; /* kstat type-specific data 
*/ uint_t ks_ndata; /* # of data records */ size_t ks_data_size; /* size of kstat data section */ kstat_update_t *ks_update; /* dynamic updates */ void *ks_private; /* private data */ kmutex_t ks_private_lock; /* kstat private data lock */ kmutex_t *ks_lock; /* kstat data lock */ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ char *ks_raw_buf; /* buf used for raw ops */ size_t ks_raw_bufsize; /* size of raw ops buffer */ kstat_proc_entry_t ks_proc; /* data for procfs entry */ }; typedef struct kstat_named_s { char name[KSTAT_STRLEN]; /* name of counter */ uchar_t data_type; /* data type */ union { char c[16]; /* 128-bit int */ int32_t i32; /* 32-bit signed int */ uint32_t ui32; /* 32-bit unsigned int */ int64_t i64; /* 64-bit signed int */ uint64_t ui64; /* 64-bit unsigned int */ long l; /* native signed long */ ulong_t ul; /* native unsigned long */ struct { union { char *ptr; /* NULL-term string */ char __pad[8]; /* 64-bit padding */ } addr; uint32_t len; /* # bytes for strlen + '\0' */ } string; } value; } kstat_named_t; #define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) #define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) #ifdef HAVE_PROC_OPS_STRUCT typedef struct proc_ops kstat_proc_op_t; #else typedef struct file_operations kstat_proc_op_t; #endif typedef struct kstat_intr { uint_t intrs[KSTAT_NUM_INTRS]; } kstat_intr_t; typedef struct kstat_io { u_longlong_t nread; /* number of bytes read */ u_longlong_t nwritten; /* number of bytes written */ uint_t reads; /* number of read operations */ uint_t writes; /* number of write operations */ hrtime_t wtime; /* cumulative wait (pre-service) time */ hrtime_t wlentime; /* cumulative wait len*time product */ hrtime_t wlastupdate; /* last time wait queue changed */ hrtime_t rtime; /* cumulative run (service) time */ hrtime_t rlentime; /* cumulative run length*time product */ hrtime_t rlastupdate; /* last time run queue changed */ uint_t wcnt; /* count of elements in wait state */ uint_t rcnt; /* count of elements in run state */ } kstat_io_t; typedef struct kstat_timer { - char name[KSTAT_STRLEN+1]; /* event name */ + char name[KSTAT_STRLEN]; /* event name */ u_longlong_t num_events; /* number of events */ hrtime_t elapsed_time; /* cumulative elapsed time */ hrtime_t min_time; /* shortest event duration */ hrtime_t max_time; /* longest event duration */ hrtime_t start_time; /* previous event start time */ hrtime_t stop_time; /* previous event stop time */ } kstat_timer_t; int spl_kstat_init(void); void spl_kstat_fini(void); extern void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void* (*addr)(kstat_t *ksp, loff_t index)); extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags); extern void kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, const char *name); extern void kstat_proc_entry_delete(kstat_proc_entry_t *kpep); extern void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, const kstat_proc_op_t *file_ops, void *data); extern void __kstat_install(kstat_t *ksp); extern void __kstat_delete(kstat_t *ksp); #define kstat_set_raw_ops(k, h, d, a) \ __kstat_set_raw_ops(k, h, d, a) #define kstat_create(m, i, n, c, t, s, f) \ __kstat_create(m, i, n, c, t, s, f) #define kstat_install(k) __kstat_install(k) #define kstat_delete(k) __kstat_delete(k) #endif /* _SPL_KSTAT_H */ diff --git 
a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 29798af0371e..3288268fb258 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1,5590 +1,5590 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov * Copyright 2017-2018 RackTop Systems. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K * Copyright (c) 2021 Matt Fiddaman */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #include #endif /* HAVE_IDMAP */ #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_deleg.h" static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human * readable form. */ const char * zfs_type_to_name(zfs_type_t type) { switch (type) { case ZFS_TYPE_FILESYSTEM: return (dgettext(TEXT_DOMAIN, "filesystem")); case ZFS_TYPE_SNAPSHOT: return (dgettext(TEXT_DOMAIN, "snapshot")); case ZFS_TYPE_VOLUME: return (dgettext(TEXT_DOMAIN, "volume")); case ZFS_TYPE_POOL: return (dgettext(TEXT_DOMAIN, "pool")); case ZFS_TYPE_BOOKMARK: return (dgettext(TEXT_DOMAIN, "bookmark")); default: assert(!"unhandled zfs_type_t"); } return (NULL); } /* * Validate a ZFS path. This is used even before trying to open the dataset, to * provide a more meaningful error message. We call zfs_error_aux() to * explain exactly why the name was not valid. 
*/ int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying) { namecheck_err_t why; char what; if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot delimiter '@' is not expected here")); return (0); } if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '@' delimiter in snapshot name")); return (0); } if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bookmark delimiter '#' is not expected here")); return (0); } if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '#' delimiter in bookmark name")); return (0); } if (modifying && strchr(path, '%') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character %c in name"), '%'); return (0); } if (entity_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component or misplaced '@'" " or '#' delimiter in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in name"), what); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool doesn't begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "reserved disk name")); break; case NAME_ERR_SELF_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "self reference, '.' is found in name")); break; case NAME_ERR_PARENT_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent reference, '..' is found in name")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (0); } return (-1); } int zfs_name_valid(const char *name, zfs_type_t type) { if (type == ZFS_TYPE_POOL) return (zpool_name_valid(NULL, B_FALSE, name)); return (zfs_validate_name(NULL, name, type, B_FALSE)); } /* * This function takes the raw DSL properties, and filters out the user-defined * properties into a separate nvlist. 
*/ static nvlist_t * process_user_props(zfs_handle_t *zhp, nvlist_t *props) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvpair_t *elem; nvlist_t *nvl; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { if (!zfs_prop_user(nvpair_name(elem))) continue; nvlist_t *propval = fnvpair_value_nvlist(elem); if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { nvlist_free(nvl); (void) no_memory(hdl); return (NULL); } } return (nvl); } static zpool_handle_t * zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph; if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { if (hdl->libzfs_pool_handles != NULL) zph->zpool_next = hdl->libzfs_pool_handles; hdl->libzfs_pool_handles = zph; } return (zph); } static zpool_handle_t * zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph = hdl->libzfs_pool_handles; while ((zph != NULL) && (strncmp(pool_name, zpool_get_name(zph), len) != 0)) zph = zph->zpool_next; return (zph); } /* * Returns a handle to the pool that contains the provided dataset. * If a handle to that pool already exists then that handle is returned. * Otherwise, a new handle is created and added to the list of handles. */ static zpool_handle_t * zpool_handle(zfs_handle_t *zhp) { char *pool_name; int len; zpool_handle_t *zph; len = strcspn(zhp->zfs_name, "/@#") + 1; pool_name = zfs_alloc(zhp->zfs_hdl, len); (void) strlcpy(pool_name, zhp->zfs_name, len); zph = zpool_find_handle(zhp, pool_name, len); if (zph == NULL) zph = zpool_add_handle(zhp, pool_name); free(pool_name); return (zph); } void zpool_free_handles(libzfs_handle_t *hdl) { zpool_handle_t *next, *zph = hdl->libzfs_pool_handles; while (zph != NULL) { next = zph->zpool_next; zpool_close(zph); zph = next; } hdl->libzfs_pool_handles = NULL; } /* * Utility function to gather stats (objset and zpl) for the given object. */ static int get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, zc); else return (-1); } return (0); } /* * Utility function to get the received properties of the given object. */ static int get_recvd_props_ioctl(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *recvdprops; zfs_cmd_t zc = {"\0"}; int err; zcmd_alloc_dst_nvlist(hdl, &zc, 0); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); return (-1); } } err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); zcmd_free_nvlists(&zc); if (err != 0) return (-1); nvlist_free(zhp->zfs_recvd_props); zhp->zfs_recvd_props = recvdprops; return (0); } static int put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) { nvlist_t *allprops, *userprops; zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { return (-1); } /* * XXX Why do we store the user props separately, in addition to * storing them in zfs_props? 
*/ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); } nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); zhp->zfs_props = allprops; zhp->zfs_user_props = userprops; return (0); } static int get_stats(zfs_handle_t *zhp) { int rc = 0; zfs_cmd_t zc = {"\0"}; zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0); if (get_stats_ioctl(zhp, &zc) != 0) rc = -1; else if (put_stats_zhdl(zhp, &zc) != 0) rc = -1; zcmd_free_nvlists(&zc); return (rc); } /* * Refresh the properties currently stored in the handle. */ void zfs_refresh_properties(zfs_handle_t *zhp) { (void) get_stats(zhp); } /* * Makes a handle from the given dataset name. Used by zfs_open() and * zfs_iter_* to create child handles on the fly. */ static int make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) { if (put_stats_zhdl(zhp, zc) != 0) return (-1); /* * We've managed to open the dataset and gather statistics. Determine * the high-level type. */ if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) { zhp->zfs_head_type = ZFS_TYPE_VOLUME; } else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) { zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM; } else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) { errno = EINVAL; return (-1); } else if (zhp->zfs_dmustats.dds_inconsistent) { errno = EBUSY; return (-1); } else { abort(); } if (zhp->zfs_dmustats.dds_is_snapshot) zhp->zfs_type = ZFS_TYPE_SNAPSHOT; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_type = ZFS_TYPE_FILESYSTEM; else abort(); /* we should never see any other types */ if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) return (-1); return (0); } zfs_handle_t * make_dataset_handle(libzfs_handle_t *hdl, const char *path) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); zcmd_alloc_dst_nvlist(hdl, &zc, 0); if (get_stats_ioctl(zhp, &zc) == -1) { zcmd_free_nvlists(&zc); free(zhp); return (NULL); } if (make_dataset_handle_common(zhp, &zc) == -1) { free(zhp); zhp = NULL; } zcmd_free_nvlists(&zc); return (zhp); } zfs_handle_t * make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); if (make_dataset_handle_common(zhp, zc) == -1) { free(zhp); return (NULL); } return (zhp); } zfs_handle_t * make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = pzhp->zfs_hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); zhp->zfs_head_type = pzhp->zfs_type; zhp->zfs_type = ZFS_TYPE_SNAPSHOT; zhp->zpool_hdl = zpool_handle(zhp); zhp->zfs_dmustats = zc->zc_objset_stats; return (zhp); } zfs_handle_t * zfs_handle_dup(zfs_handle_t *zhp_orig) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = zhp_orig->zfs_hdl; zhp->zpool_hdl = zhp_orig->zpool_hdl; (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, sizeof (zhp->zfs_name)); zhp->zfs_type = zhp_orig->zfs_type; zhp->zfs_head_type = zhp_orig->zfs_head_type; zhp->zfs_dmustats = zhp_orig->zfs_dmustats; if (zhp_orig->zfs_props != NULL) { if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { (void) 
no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_user_props != NULL) { if (nvlist_dup(zhp_orig->zfs_user_props, &zhp->zfs_user_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_recvd_props != NULL) { if (nvlist_dup(zhp_orig->zfs_recvd_props, &zhp->zfs_recvd_props, 0)) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; if (zhp_orig->zfs_mntopts != NULL) { zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, zhp_orig->zfs_mntopts); } zhp->zfs_props_table = zhp_orig->zfs_props_table; return (zhp); } boolean_t zfs_bookmark_exists(const char *path) { nvlist_t *bmarks; nvlist_t *props; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmark_name; char *pound; int err; boolean_t rv; (void) strlcpy(fsname, path, sizeof (fsname)); pound = strchr(fsname, '#'); if (pound == NULL) return (B_FALSE); *pound = '\0'; bmark_name = pound + 1; props = fnvlist_alloc(); err = lzc_get_bookmarks(fsname, props, &bmarks); nvlist_free(props); if (err != 0) { nvlist_free(bmarks); return (B_FALSE); } rv = nvlist_exists(bmarks, bmark_name); nvlist_free(bmarks); return (rv); } zfs_handle_t * make_bookmark_handle(zfs_handle_t *parent, const char *path, nvlist_t *bmark_props) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); /* Fill in the name. */ zhp->zfs_hdl = parent->zfs_hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); /* Set the property lists. */ if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { free(zhp); return (NULL); } /* Set the types. */ zhp->zfs_head_type = parent->zfs_head_type; zhp->zfs_type = ZFS_TYPE_BOOKMARK; if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { nvlist_free(zhp->zfs_props); free(zhp); return (NULL); } return (zhp); } struct zfs_open_bookmarks_cb_data { const char *path; zfs_handle_t *zhp; }; static int zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data) { struct zfs_open_bookmarks_cb_data *dp = data; /* * Is it the one we are looking for? */ if (strcmp(dp->path, zfs_get_name(zhp)) == 0) { /* * We found it. Save it and let the caller know we are done. */ dp->zhp = zhp; return (EEXIST); } /* * Not found. Close the handle and ask for another one. */ zfs_close(zhp); return (0); } /* * Opens the given snapshot, bookmark, filesystem, or volume. The 'types' * argument is a mask of acceptable types. The function will print an * appropriate error message and return NULL if it can't be opened. */ zfs_handle_t * zfs_open(libzfs_handle_t *hdl, const char *path, int types) { zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; char *bookp; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); /* * Validate the name before we even try to open it. */ if (!zfs_validate_name(hdl, path, types, B_FALSE)) { (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); return (NULL); } /* * Bookmarks needs to be handled separately. */ bookp = strchr(path, '#'); if (bookp == NULL) { /* * Try to get stats for the dataset, which will tell us if it * exists. */ errno = 0; if ((zhp = make_dataset_handle(hdl, path)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } } else { char dsname[ZFS_MAX_DATASET_NAME_LEN]; zfs_handle_t *pzhp; struct zfs_open_bookmarks_cb_data cb_data = {path, NULL}; /* * We need to cut out '#' and everything after '#' * to get the parent dataset name only. 
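 * Copying with strlcpy() and a size of (bookp - path + 1) takes just the
 * dataset portion and always NUL-terminates, which the strncpy() call it
 * replaces did not guarantee on its own.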
*/ assert(bookp - path < sizeof (dsname)); - (void) strncpy(dsname, path, bookp - path); - dsname[bookp - path] = '\0'; + (void) strlcpy(dsname, path, + MIN(sizeof (dsname), bookp - path + 1)); /* * Create handle for the parent dataset. */ errno = 0; if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } /* * Iterate bookmarks to find the right one. */ errno = 0; if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb, &cb_data) == 0) && (cb_data.zhp == NULL)) { (void) zfs_error(hdl, EZFS_NOENT, errbuf); zfs_close(pzhp); return (NULL); } if (cb_data.zhp == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); zfs_close(pzhp); return (NULL); } zhp = cb_data.zhp; /* * Cleanup. */ zfs_close(pzhp); } if (!(types & zhp->zfs_type)) { (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (NULL); } return (zhp); } /* * Release a ZFS handle. Nothing to do but free the associated memory. */ void zfs_close(zfs_handle_t *zhp) { if (zhp->zfs_mntopts) free(zhp->zfs_mntopts); nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); nvlist_free(zhp->zfs_recvd_props); free(zhp); } typedef struct mnttab_node { struct mnttab mtn_mt; avl_node_t mtn_node; } mnttab_node_t; static int libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) { const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1; const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2; int rv; rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); return (TREE_ISIGN(rv)); } void libzfs_mnttab_init(libzfs_handle_t *hdl) { pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); } static int libzfs_mnttab_update(libzfs_handle_t *hdl) { FILE *mnttab; struct mnttab entry; if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); while (getmntent(mnttab, &entry) == 0) { mnttab_node_t *mtn; avl_index_t where; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); /* Exclude duplicate mounts */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); continue; } avl_add(&hdl->libzfs_mnttab_cache, mtn); } (void) fclose(mnttab); return (0); } void libzfs_mnttab_fini(libzfs_handle_t *hdl) { void *cookie = NULL; mnttab_node_t *mtn; while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } avl_destroy(&hdl->libzfs_mnttab_cache); (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) { hdl->libzfs_mnttab_enable = enable; } int libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, struct mnttab *entry) { FILE *mnttab; mnttab_node_t find; mnttab_node_t *mtn; int ret = ENOENT; if (!hdl->libzfs_mnttab_enable) { struct mnttab srch = { 0 }; if (avl_numnodes(&hdl->libzfs_mnttab_cache)) libzfs_mnttab_fini(hdl); if ((mnttab = 
fopen(MNTTAB, "re")) == NULL) return (ENOENT); srch.mnt_special = (char *)fsname; srch.mnt_fstype = (char *)MNTTYPE_ZFS; ret = getmntany(mnttab, entry, &srch) ? ENOENT : 0; (void) fclose(mnttab); return (ret); } pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) { int error; if ((error = libzfs_mnttab_update(hdl)) != 0) { pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (error); } } find.mtn_mt.mnt_special = (char *)fsname; mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); if (mtn) { *entry = mtn->mtn_mt; ret = 0; } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (ret); } void libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, const char *mountp, const char *mntopts) { mnttab_node_t *mtn; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) { mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); /* * Another thread may have already added this entry * via libzfs_mnttab_update. If so we should skip it. */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } else { avl_add(&hdl->libzfs_mnttab_cache, mtn); } } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) { mnttab_node_t find; mnttab_node_t *ret; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); find.mtn_mt.mnt_special = (char *)fsname; if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) != NULL) { avl_remove(&hdl->libzfs_mnttab_cache, ret); free(ret->mtn_mt.mnt_special); free(ret->mtn_mt.mnt_mountp); free(ret->mtn_mt.mnt_fstype); free(ret->mtn_mt.mnt_mntopts); free(ret); } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } int zfs_spa_version(zfs_handle_t *zhp, int *spa_version) { zpool_handle_t *zpool_handle = zhp->zpool_hdl; if (zpool_handle == NULL) return (-1); *spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); return (0); } /* * The choice of reservation property depends on the SPA version. */ static int zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) { int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); if (spa_version >= SPA_VERSION_REFRESERVATION) *resv_prop = ZFS_PROP_REFRESERVATION; else *resv_prop = ZFS_PROP_RESERVATION; return (0); } /* * Given an nvlist of properties to set, validates that they are correct, and * parses any numeric properties (index, boolean, etc) if they are specified as * strings. */ nvlist_t * zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, boolean_t key_params_ok, const char *errbuf) { nvpair_t *elem; uint64_t intval; char *strval; zfs_prop_t prop; nvlist_t *ret; int chosen_normal = -1; int chosen_utf = -1; if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } /* * Make sure this property is valid and applies to this type. 
*/ elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zfs_name_to_prop(propname); if (prop == ZPROP_USERPROP && zfs_prop_user(propname)) { /* * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property name '%s' is too long"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (nvlist_add_string(ret, propname, strval) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Currently, only user properties can be modified on * snapshots. */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (prop == ZPROP_USERPROP && zfs_prop_userquota(propname)) { zfs_userquota_prop_t uqtype; char *newpropname = NULL; char domain[128]; uint64_t rid; uint64_t valary[3]; int rc; if (userquota_propname_decode(propname, zoned, &uqtype, domain, sizeof (domain), &rid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' has an invalid user/group name"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (uqtype != ZFS_PROP_USERQUOTA && uqtype != ZFS_PROP_GROUPQUOTA && uqtype != ZFS_PROP_USEROBJQUOTA && uqtype != ZFS_PROP_GROUPOBJQUOTA && uqtype != ZFS_PROP_PROJECTQUOTA && uqtype != ZFS_PROP_PROJECTOBJQUOTA) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (nvpair_type(elem) == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &strval); if (strcmp(strval, "none") == 0) { intval = 0; } else if (zfs_nicestrtonum(hdl, strval, &intval) != 0) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, &intval); if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable " "{user|group|project}quota")); goto error; } } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * Encode the prop name as * userquota@-domain, to make it easy * for the kernel to decode. 
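 * For example, a user quota for rid 1000 (0x3e8) in domain "example.com"
 * would be encoded as "userquota@3e8-example.com" (illustrative values).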
*/ rc = asprintf(&newpropname, "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], (longlong_t)rid, domain); if (rc == -1 || newpropname == NULL) { (void) no_memory(hdl); goto error; } valary[0] = uqtype; valary[1] = rid; valary[2] = intval; if (nvlist_add_uint64_array(ret, newpropname, valary, 3) != 0) { free(newpropname); (void) no_memory(hdl); goto error; } free(newpropname); continue; } else if (prop == ZPROP_USERPROP && zfs_prop_written(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!zfs_prop_valid_for_type(prop, type, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " "apply to datasets of this type"), propname); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (zfs_prop_readonly(prop) && !(zfs_prop_setonce(prop) && zhp == NULL) && !(zfs_prop_encryption_key_param(prop) && key_params_ok)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, type, ret, &strval, &intval, errbuf) != 0) goto error; /* * Perform some additional checks for specific properties. */ switch (prop) { case ZFS_PROP_VERSION: { int version; if (zhp == NULL) break; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (intval < version) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Can not downgrade; already at version %u"), version); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: { int maxbs = SPA_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); } /* * The value must be a power of two between * SPA_MINBLOCKSIZE and maxbs. */ if (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval)) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be power of 2 from 512B " "to %s"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: { int maxbs = SPA_OLD_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { char state[64] = ""; maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); /* * Issue a warning but do not fail so that * tests for settable properties succeed. */ if (zpool_prop_get_feature(zpool_hdl, "feature@allocation_classes", state, sizeof (state)) != 0 || strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { (void) fprintf(stderr, gettext( "%s: property requires a special " "device in the pool\n"), propname); } } if (intval != 0 && (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval))) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid '%s=%llu' property: must be zero " "or a power of 2 from 512B to %s"), propname, (unsigned long long)intval, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL /* * Verify the mlslabel string and convert to * internal hex label string. */ m_label_t *new_sl; char *hex = NULL; /* internal label string */ /* Default value is already OK. 
*/ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) break; /* Verify the label can be converted to binary form */ if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || (str_to_label(strval, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1)) { goto badlabel; } /* Now translate to hex internal label string */ if (label_to_str(new_sl, &hex, M_INTERNAL, DEF_NAMES) != 0) { if (hex) free(hex); goto badlabel; } m_label_free(new_sl); /* If string is already in internal form, we're done. */ if (strcmp(strval, hex) == 0) { free(hex); break; } /* Replace the label string with the internal form. */ (void) nvlist_remove(ret, zfs_prop_to_name(prop), DATA_TYPE_STRING); fnvlist_add_string(ret, zfs_prop_to_name(prop), hex); free(hex); break; badlabel: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid mlslabel '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); m_label_free(new_sl); /* OK if null */ goto error; #else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "mlslabels are unsupported")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; #endif /* HAVE_MLSLABEL */ } case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) break; if (mountpoint_namecheck(strval, &why)) { switch (why) { case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be an absolute path, " "'none', or 'legacy'"), propname); break; case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "component of '%s' is too long"), propname); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } zfs_fallthrough; } case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* * For the mountpoint and sharenfs or sharesmb * properties, check if it can be set in a * global/non-global zone based on * the zoned property value: * * global zone non-global zone * -------------------------------------------------- * zoned=on mountpoint (no) mountpoint (yes) * sharenfs (no) sharenfs (no) * sharesmb (no) sharesmb (no) * * zoned=off mountpoint (yes) N/A * sharenfs (yes) * sharesmb (yes) */ if (zoned) { if (getzoneid() == GLOBAL_ZONEID) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set on " "dataset in a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } else if (prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set in " "a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " "'zoned' property is set"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } /* * At this point, it is legitimate to set the * property. Now we want to make sure that the * property value is valid if it is sharenfs. 
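 * Any value other than "on" or "off" is treated as a share option string
 * and is validated with sa_validate_shareopts() for the matching protocol.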
*/ if ((prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) && strcmp(strval, "on") != 0 && strcmp(strval, "off") != 0) { enum sa_protocol proto; if (prop == ZFS_PROP_SHARESMB) proto = SA_PROTOCOL_SMB; else proto = SA_PROTOCOL_NFS; if (sa_validate_shareopts(strval, proto) != SA_OK) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set to invalid " "options"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_KEYLOCATION: if (!zfs_prop_valid_keylocation(strval, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid keylocation")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zhp != NULL) { uint64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); if (crypt == ZIO_CRYPT_OFF && strcmp(strval, "none") != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must be 'none' " "for unencrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (crypt != ZIO_CRYPT_OFF && strcmp(strval, "none") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must not be 'none' " "for encrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_PBKDF2_ITERS: if (intval < MIN_PBKDF2_ITERATIONS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "minimum pbkdf2 iterations is %u"), MIN_PBKDF2_ITERATIONS); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; case ZFS_PROP_NORMALIZE: chosen_normal = (int)intval; break; default: break; } /* * For changes to existing volumes, we have some additional * checks to enforce. */ if (type == ZFS_TYPE_VOLUME && zhp != NULL) { uint64_t blocksize = zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE); char buf[64]; switch (prop) { case ZFS_PROP_VOLSIZE: if (intval % blocksize != 0) { zfs_nicebytes(blocksize, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a multiple of " "volume block size (%s)"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be zero"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } /* check encryption properties */ if (zhp != NULL) { int64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); switch (prop) { case ZFS_PROP_COPIES: if (crypt != ZIO_CRYPT_OFF && intval > 2) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encrypted datasets cannot have " "3 copies")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } } /* * If normalization was chosen, but no UTF8 choice was made, * enforce rejection of non-UTF8 names. * * If normalization was chosen, but rejecting non-UTF8 names * was explicitly not chosen, it is an error. * * If utf8only was turned off, but the parent has normalization, * turn off normalization. 
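 * For example, setting normalization=formD alone implies utf8only=on, while
 * pairing it with an explicit utf8only=off is rejected as an error.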
*/ if (chosen_normal > 0 && chosen_utf < 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { (void) no_memory(hdl); goto error; } } else if (chosen_normal > 0 && chosen_utf == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be set 'on' if normalization chosen"), zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (chosen_normal < 0 && chosen_utf == 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_NORMALIZE), 0) != 0) { (void) no_memory(hdl); goto error; } } return (ret); error: nvlist_free(ret); return (NULL); } static int zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t old_volsize; uint64_t new_volsize; uint64_t old_reservation; uint64_t new_reservation; zfs_prop_t resv_prop; nvlist_t *props; zpool_handle_t *zph = zpool_handle(zhp); /* * If this is an existing volume, and someone is setting the volsize, * make sure that it matches the reservation, or add it if necessary. */ old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_reservation = zfs_prop_get_int(zhp, resv_prop); props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if ((zvol_volsize_to_reservation(zph, old_volsize, props) != old_reservation) || nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) { fnvlist_free(props); return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &new_volsize) != 0) { fnvlist_free(props); return (-1); } new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); fnvlist_free(props); if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), new_reservation) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } /* * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value. * Return codes must match zfs_add_synthetic_resv(). */ static int zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t volsize; uint64_t resvsize; zfs_prop_t prop; nvlist_t *props; if (!ZFS_IS_VOLUME(zhp)) { return (0); } if (zfs_which_resv_prop(zhp, &prop) != 0) { return (-1); } if (prop != ZFS_PROP_REFRESERVATION) { return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) { /* No value being set, so it can't be "auto" */ return (0); } if (resvsize != UINT64_MAX) { /* Being set to a value other than "auto" */ return (0); } props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); } resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, props); fnvlist_free(props); (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * Given a property name and value, set the property for the given dataset. 
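 * This is a thin convenience wrapper: the name/value pair is packed into a
 * single-entry nvlist and handed to zfs_prop_set_list().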
*/ int zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) { int ret = -1; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_string(nvl, propname, propval) != 0) { (void) no_memory(hdl); goto error; } ret = zfs_prop_set_list(zhp, nvl); error: nvlist_free(nvl); return (ret); } /* * Given an nvlist of property names and values, set the properties for the * given dataset. */ int zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) { zfs_cmd_t zc = {"\0"}; int ret = -1; prop_changelist_t **cls = NULL; int cl_idx; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; zfs_prop_t prop = 0; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, B_FALSE, errbuf)) == NULL) goto error; /* * We have to check for any extra properties which need to be added * before computing the length of the nvlist. */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { goto error; } } if (added_resv != 1 && (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) { goto error; } /* * Check how many properties we're setting and allocate an array to * store changelist pointers for postfix(). */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) nvl_len++; if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) goto error; cl_idx = 0; for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); assert(cl_idx < nvl_len); /* * We don't want to unmount & remount the dataset when changing * its canmount property to 'on' or 'noauto'. We only use * the changelist logic to unmount when setting canmount=off. */ if (prop != ZFS_PROP_CANMOUNT || (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && zfs_is_mounted(zhp, NULL))) { cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); if (cls[cl_idx] == NULL) goto error; } if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cls[cl_idx])) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if (cls[cl_idx] != NULL && (ret = changelist_prefix(cls[cl_idx])) != 0) goto error; cl_idx++; } assert(cl_idx == nvl_len); /* * Execute the corresponding ioctl() to set this list of properties. */ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zcmd_write_src_nvlist(hdl, &zc, nvl); zcmd_alloc_dst_nvlist(hdl, &zc, 0); ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); if (ret != 0) { if (zc.zc_nvlist_dst_filled == B_FALSE) { (void) zfs_standard_error(hdl, errno, errbuf); goto error; } /* Get the list of unset properties back and report them. 
*/ nvlist_t *errorprops = NULL; if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) goto error; for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL); elem != NULL; elem = nvlist_next_nvpair(errorprops, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); zfs_setprop_error(hdl, prop, errno, errbuf); } nvlist_free(errorprops); if (added_resv && errno == ENOSPC) { /* clean up the volsize property we tried to set */ uint64_t old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); nvlist_free(nvl); nvl = NULL; zcmd_free_nvlists(&zc); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) goto error; if (nvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), old_volsize) != 0) goto error; zcmd_write_src_nvlist(hdl, &zc, nvl); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); } } else { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) { int clp_err = changelist_postfix(cls[cl_idx]); if (clp_err != 0) ret = clp_err; } } if (ret == 0) { /* * Refresh the statistics so the new property * value is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } error: nvlist_free(nvl); zcmd_free_nvlists(&zc); if (cls != NULL) { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) changelist_free(cls[cl_idx]); } free(cls); } return (ret); } /* * Given a property, inherit the value from the parent dataset, or if received * is TRUE, revert to the received value, if any. */ int zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) { zfs_cmd_t zc = {"\0"}; int ret; prop_changelist_t *cl; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; zfs_prop_t prop; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s'"), propname, zhp->zfs_name); zc.zc_cookie = received; if ((prop = zfs_name_to_prop(propname)) == ZPROP_USERPROP) { /* * For user properties, the amount of work we have to do is very * small, so just do it here. */ if (!zfs_prop_user(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) return (zfs_standard_error(hdl, errno, errbuf)); (void) get_stats(zhp); return (0); } /* * Verify that this property is inheritable. */ if (zfs_prop_readonly(prop)) return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); if (!zfs_prop_inheritable(prop) && !received) return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); /* * Check to see if the value applies to this type */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); /* * Normalize the name, to get rid of shorthand abbreviations. */ propname = zfs_prop_to_name(prop); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Determine datasets which will be affected by this change, if any. 
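 * The changelist machinery is what keeps mounts and shares consistent:
 * changelist_prefix() unmounts/unshares anything whose view of the
 * property is about to change, the ioctl applies the inherit, and
 * changelist_postfix() remounts/reshares afterwards.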
*/ if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) return (-1); if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) { changelist_free(cl); return (zfs_standard_error(hdl, errno, errbuf)); } else { if ((ret = changelist_postfix(cl)) != 0) goto error; /* * Refresh the statistics so the new property is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } error: changelist_free(cl); return (ret); } /* * True DSL properties are stored in an nvlist. The following two functions * extract them appropriately. */ uint64_t getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; uint64_t value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_uint64(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = (char *)""; } return (value); } static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_string(prop); *source = (char *)""; } return (value); } static boolean_t zfs_is_recvd_props_mode(zfs_handle_t *zhp) { return (zhp->zfs_props == zhp->zfs_recvd_props); } static void zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; zhp->zfs_props = zhp->zfs_recvd_props; } static void zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; *cookie = 0; } /* * Internal function for getting a numeric property. Both zfs_prop_get() and * zfs_prop_get_int() are built using this interface. * * Certain properties can be overridden using 'mount -o'. In this case, scan * the contents of the /proc/self/mounts entry, searching for the * appropriate options. If they differ from the on-disk values, report the * current values and mark the source "temporary". */ static int get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, char **source, uint64_t *val) { zfs_cmd_t zc = {"\0"}; nvlist_t *zplprops = NULL; struct mnttab mnt; const char *mntopt_on = NULL; const char *mntopt_off = NULL; boolean_t received = zfs_is_recvd_props_mode(zhp); *source = NULL; /* * If the property is being fetched for a snapshot, check whether * the property is valid for the snapshot's head dataset type. 
*/ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT && !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { *val = zfs_prop_default_numeric(prop); return (-1); } switch (prop) { case ZFS_PROP_ATIME: mntopt_on = MNTOPT_ATIME; mntopt_off = MNTOPT_NOATIME; break; case ZFS_PROP_RELATIME: mntopt_on = MNTOPT_RELATIME; mntopt_off = MNTOPT_NORELATIME; break; case ZFS_PROP_DEVICES: mntopt_on = MNTOPT_DEVICES; mntopt_off = MNTOPT_NODEVICES; break; case ZFS_PROP_EXEC: mntopt_on = MNTOPT_EXEC; mntopt_off = MNTOPT_NOEXEC; break; case ZFS_PROP_READONLY: mntopt_on = MNTOPT_RO; mntopt_off = MNTOPT_RW; break; case ZFS_PROP_SETUID: mntopt_on = MNTOPT_SETUID; mntopt_off = MNTOPT_NOSETUID; break; case ZFS_PROP_XATTR: mntopt_on = MNTOPT_XATTR; mntopt_off = MNTOPT_NOXATTR; break; case ZFS_PROP_NBMAND: mntopt_on = MNTOPT_NBMAND; mntopt_off = MNTOPT_NONBMAND; break; default: break; } /* * Because looking up the mount options is potentially expensive * (iterating over all of /proc/self/mounts), we defer its * calculation until we're looking up a property which requires * its presence. */ if (!zhp->zfs_mntcheck && (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) zhp->zfs_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); zhp->zfs_mntcheck = B_TRUE; } if (zhp->zfs_mntopts == NULL) mnt.mnt_mntopts = (char *)""; else mnt.mnt_mntopts = zhp->zfs_mntopts; switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_READONLY: case ZFS_PROP_SETUID: #ifndef __FreeBSD__ case ZFS_PROP_XATTR: #endif case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); if (received) break; if (hasmntopt(&mnt, mntopt_on) && !*val) { *val = B_TRUE; if (src) *src = ZPROP_SRC_TEMPORARY; } else if (hasmntopt(&mnt, mntopt_off) && *val) { *val = B_FALSE; if (src) *src = ZPROP_SRC_TEMPORARY; } break; case ZFS_PROP_CANMOUNT: case ZFS_PROP_VOLSIZE: case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { /* not default, must be local */ *source = zhp->zfs_name; } break; case ZFS_PROP_MOUNTED: *val = (zhp->zfs_mntopts != NULL); break; case ZFS_PROP_NUMCLONES: *val = zhp->zfs_dmustats.dds_num_clones; break; case ZFS_PROP_VERSION: case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: case ZFS_PROP_CASE: zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); if (prop == ZFS_PROP_VERSION && zhp->zfs_type == ZFS_TYPE_VOLUME) *val = zfs_prop_default_numeric(prop); return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); return (-1); } nvlist_free(zplprops); zcmd_free_nvlists(&zc); break; case ZFS_PROP_INCONSISTENT: *val = zhp->zfs_dmustats.dds_inconsistent; break; case ZFS_PROP_REDACTED: *val = zhp->zfs_dmustats.dds_redacted; break; case ZFS_PROP_CREATETXG: /* * We can directly read createtxg property from zfs * handle for Filesystem, Snapshot and ZVOL types. 
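 * Other types (e.g. bookmarks) fall through to the generic lookup below
 * and read the value from the cached property nvlist instead.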
*/ if ((zhp->zfs_type == ZFS_TYPE_FILESYSTEM) || (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) || (zhp->zfs_type == ZFS_TYPE_VOLUME)) { *val = zhp->zfs_dmustats.dds_creation_txg; break; } zfs_fallthrough; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* * If we tried to use a default value for a * readonly property, it means that it was not * present. Note this only applies to "truly" * readonly properties, not set-once properties * like volblocksize. */ if (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop) && *source != NULL && (*source)[0] == '\0') { *source = NULL; return (-1); } break; case PROP_TYPE_STRING: default: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "cannot get non-numeric property")); return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "internal error"))); } } return (0); } /* * Calculate the source type, given the raw source string. */ static void get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, char *statbuf, size_t statlen) { if (statbuf == NULL || srctype == NULL || *srctype == ZPROP_SRC_TEMPORARY) { return; } if (source == NULL) { *srctype = ZPROP_SRC_NONE; } else if (source[0] == '\0') { *srctype = ZPROP_SRC_DEFAULT; } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { *srctype = ZPROP_SRC_RECEIVED; } else { if (strcmp(source, zhp->zfs_name) == 0) { *srctype = ZPROP_SRC_LOCAL; } else { (void) strlcpy(statbuf, source, statlen); *srctype = ZPROP_SRC_INHERITED; } } } int zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, size_t proplen, boolean_t literal) { zfs_prop_t prop; int err = 0; if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (-1); prop = zfs_name_to_prop(propname); if (prop != ZPROP_USERPROP) { uint64_t cookie; if (!nvlist_exists(zhp->zfs_recvd_props, propname)) return (-1); zfs_set_recvd_props_mode(zhp, &cookie); err = zfs_prop_get(zhp, prop, propbuf, proplen, NULL, NULL, 0, literal); zfs_unset_recvd_props_mode(zhp, &cookie); } else { nvlist_t *propval; char *recvdval; if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, propname, &propval) != 0) return (-1); recvdval = fnvlist_lookup_string(propval, ZPROP_VALUE); (void) strlcpy(propbuf, recvdval, proplen); } return (err == 0 ? 
0 : -1); } static int get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; nvpair_t *pair; value = zfs_get_clones_nvl(zhp); if (value == NULL || nvlist_empty(value)) return (-1); propbuf[0] = '\0'; for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; pair = nvlist_next_nvpair(value, pair)) { if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) strlcat(propbuf, nvpair_name(pair), proplen); } return (0); } struct get_clones_arg { uint64_t numclones; nvlist_t *value; const char *origin; char buf[ZFS_MAX_DATASET_NAME_LEN]; }; static int get_clones_cb(zfs_handle_t *zhp, void *arg) { struct get_clones_arg *gca = arg; if (gca->numclones == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), NULL, NULL, 0, B_TRUE) != 0) goto out; if (strcmp(gca->buf, gca->origin) == 0) { fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); gca->numclones--; } out: (void) zfs_iter_children(zhp, get_clones_cb, gca); zfs_close(zhp); return (0); } nvlist_t * zfs_get_clones_nvl(zfs_handle_t *zhp) { nvlist_t *nv, *value; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { struct get_clones_arg gca; /* * if this is a snapshot, then the kernel wasn't able * to get the clones. Do it by slowly iterating. */ if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) return (NULL); if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) return (NULL); if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { nvlist_free(nv); return (NULL); } gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); gca.value = value; gca.origin = zhp->zfs_name; if (gca.numclones != 0) { zfs_handle_t *root; char pool[ZFS_MAX_DATASET_NAME_LEN]; char *cp = pool; /* get the pool name */ (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); (void) strsep(&cp, "/@"); root = zfs_open(zhp->zfs_hdl, pool, ZFS_TYPE_FILESYSTEM); if (root == NULL) { nvlist_free(nv); nvlist_free(value); return (NULL); } (void) get_clones_cb(root, &gca); } if (gca.numclones != 0 || nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || nvlist_add_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { nvlist_free(nv); nvlist_free(value); return (NULL); } nvlist_free(nv); nvlist_free(value); nv = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES)); } return (fnvlist_lookup_nvlist(nv, ZPROP_VALUE)); } static int get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; uint64_t *snaps; uint_t nsnaps; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) return (-1); if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, &nsnaps) != 0) return (-1); if (nsnaps == 0) { /* There's no redaction snapshots; pass a special value back */ (void) snprintf(propbuf, proplen, "none"); return (0); } propbuf[0] = '\0'; for (int i = 0; i < nsnaps; i++) { char buf[128]; if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)snaps[i]); (void) strlcat(propbuf, buf, proplen); } return (0); } /* * Accepts a property and value and checks that the value * matches the one found by the channel program. If they are * not equal, print both of them. 
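 * This is a debugging aid only: it is a no-op unless libzfs_prop_debug is
 * set on the handle. The cross-check runs a small Lua channel program
 * (via lzc_channel_program_nosync()) that calls zfs.get_prop() on the
 * same dataset and property and compares the answer with what libzfs
 * computed.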
*/ static void zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval, const char *strval) { if (!zhp->zfs_hdl->libzfs_prop_debug) return; int error; char *poolname = zhp->zpool_hdl->zpool_name; const char *prop_name = zfs_prop_to_name(prop); const char *program = "args = ...\n" "ds = args['dataset']\n" "prop = args['property']\n" "value, setpoint = zfs.get_prop(ds, prop)\n" "return {value=value, setpoint=setpoint}\n"; nvlist_t *outnvl; nvlist_t *retnvl; nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string(argnvl, "dataset", zhp->zfs_name); fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop)); error = lzc_channel_program_nosync(poolname, program, 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl); if (error == 0) { retnvl = fnvlist_lookup_nvlist(outnvl, "return"); if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) { int64_t ans; error = nvlist_lookup_int64(retnvl, "value", &ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (ans != intval) { (void) fprintf(stderr, "%s: zfs found %llu, " "but zcp found %llu\n", prop_name, (u_longlong_t)intval, (u_longlong_t)ans); } } else { char *str_ans; error = nvlist_lookup_string(retnvl, "value", &str_ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (strcmp(strval, str_ans) != 0) { (void) fprintf(stderr, "%s: zfs found '%s', but zcp found '%s'\n", prop_name, strval, str_ans); } } } else { (void) fprintf(stderr, "%s: zcp check failed, channel program " "error: %u\n", prop_name, error); } nvlist_free(argnvl); nvlist_free(outnvl); } /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a * human-readable form. * * Returns 0 on success, or -1 on error. */ int zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) { char *source = NULL; uint64_t val; const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (-1); if (received && zfs_prop_readonly(prop)) return (-1); if (src) *src = ZPROP_SRC_NONE; switch (prop) { case ZFS_PROP_CREATION: /* * 'creation' is a time_t stored in the statistics. We convert * this into a string unless 'literal' is specified. */ { val = getprop_uint64(zhp, prop, &source); time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_MOUNTPOINT: /* * Getting the precise mountpoint can be tricky. * * - for 'none' or 'legacy', return those values. * - for inherited mountpoints, we want to take everything * after our ancestor and append it to the inherited value. * * If the pool has an alternate root, we want to prepend that * root to any values we return. */ str = getprop_string(zhp, prop, &source); if (str[0] == '/') { char buf[MAXPATHLEN]; char *root = buf; const char *relpath; /* * If we inherit the mountpoint, even from a dataset * with a received value, the source will be the path of * the dataset we inherit from. If source is * ZPROP_SOURCE_VAL_RECVD, the received value is not * inherited. 
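 * Illustration (hypothetical names): if 'pool/home' sets
 * mountpoint=/export/home and 'pool/home/user' inherits it, the source is
 * "pool/home", relpath becomes "user", and the reported mountpoint is
 * /export/home/user, prefixed by the pool's altroot if one is set.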
*/ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { relpath = ""; } else { relpath = zhp->zfs_name + strlen(source); if (relpath[0] == '/') relpath++; } if ((zpool_get_prop(zhp->zpool_hdl, ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, B_FALSE)) || (strcmp(root, "-") == 0)) root[0] = '\0'; /* * Special case an alternate root of '/'. This will * avoid having multiple leading slashes in the * mountpoint path. */ if (strcmp(root, "/") == 0) root++; /* * If the mountpoint is '/' then skip over this * if we are obtaining either an alternate root or * an inherited mountpoint. */ if (str[1] == '\0' && (root[0] != '\0' || relpath[0] != '\0')) str++; if (relpath[0] == '\0') (void) snprintf(propbuf, proplen, "%s%s", root, str); else (void) snprintf(propbuf, proplen, "%s%s%s%s", root, str, relpath[0] == '@' ? "" : "/", relpath); } else { /* 'legacy' or 'none' */ (void) strlcpy(propbuf, str, proplen); } zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_ORIGIN: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case ZFS_PROP_REDACT_SNAPS: if (get_rsnaps_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_CLONES: if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If quota or reservation is 0, we translate this into 'none' * (unless literal is set), and indicate that it's the default * value. Otherwise, we print the number nicely and indicate * that its set locally. */ if (val == 0) { if (literal) (void) strlcpy(propbuf, "0", proplen); else (void) strlcpy(propbuf, "none", proplen); } else { if (literal) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); else zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If limit is UINT64_MAX, we translate this into 'none', and * indicate that it's the default value. Otherwise, we print * the number nicely and indicate that it's set locally. */ if (val == UINT64_MAX) { (void) strlcpy(propbuf, "none", proplen); } else if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) (void) snprintf(propbuf, proplen, "%llu.%02llu", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); else (void) snprintf(propbuf, proplen, "%llu.%02llux", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_TYPE: switch (zhp->zfs_type) { case ZFS_TYPE_FILESYSTEM: str = "filesystem"; break; case ZFS_TYPE_VOLUME: str = "volume"; break; case ZFS_TYPE_SNAPSHOT: str = "snapshot"; break; case ZFS_TYPE_BOOKMARK: str = "bookmark"; break; default: abort(); } (void) snprintf(propbuf, proplen, "%s", str); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MOUNTED: /* * The 'mounted' property is a pseudo-property that described * whether the filesystem is currently mounted. 
Even though * it's a boolean value, the typical values of "on" and "off" * don't make sense, so we translate to "yes" and "no". */ if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, src, &source, &val) != 0) return (-1); if (val) (void) strlcpy(propbuf, "yes", proplen); else (void) strlcpy(propbuf, "no", proplen); break; case ZFS_PROP_NAME: /* * The 'name' property is a pseudo-property derived from the * dataset name. It is presented as a real property to simplify * consumers. */ (void) strlcpy(propbuf, zhp->zfs_name, proplen); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL m_label_t *new_sl = NULL; char *ascii = NULL; /* human readable label */ (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); if (literal || (strcasecmp(propbuf, ZFS_MLSLABEL_DEFAULT) == 0)) break; /* * Try to translate the internal hex string to * human-readable output. If there are any * problems just use the hex string. */ if (str_to_label(propbuf, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1) { m_label_free(new_sl); break; } if (label_to_str(new_sl, &ascii, M_LABEL, DEF_NAMES) != 0) { if (ascii) free(ascii); m_label_free(new_sl); break; } m_label_free(new_sl); (void) strlcpy(propbuf, ascii, proplen); free(ascii); #else (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); #endif /* HAVE_MLSLABEL */ } break; case ZFS_PROP_GUID: case ZFS_PROP_KEY_GUID: case ZFS_PROP_IVSET_GUID: case ZFS_PROP_CREATETXG: case ZFS_PROP_OBJSETID: case ZFS_PROP_PBKDF2_ITERS: /* * These properties are stored as numbers, but they are * identifiers or counters. * We don't want them to be pretty printed, because pretty * printing truncates their values making them useless. */ if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFERENCED: case ZFS_PROP_AVAILABLE: case ZFS_PROP_USED: case ZFS_PROP_USEDSNAP: case ZFS_PROP_USEDDS: case ZFS_PROP_USEDREFRESERV: case ZFS_PROP_USEDCHILD: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_SNAPSHOTS_CHANGED: { if ((get_numeric_property(zhp, prop, src, &source, &val) != 0) || val == 0) { return (-1); } time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M:%S %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) { return (-1); } if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case PROP_TYPE_STRING: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case PROP_TYPE_INDEX: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (zfs_prop_index_to_string(prop, val, &strval) != 0) return (-1); (void) strlcpy(propbuf, strval, proplen); zcp_check(zhp, prop, 0, strval); break; default: abort(); } } get_source(zhp, src, source, statbuf, statlen); return (0); } /* * Utility function to get the given numeric 
property. Does no validation that * the given property is the appropriate type; should only be used with * hard-coded property types. */ uint64_t zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) { char *source; uint64_t val = 0; (void) get_numeric_property(zhp, prop, NULL, &source, &val); return (val); } static int zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) { char buf[64]; (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); } /* * Similar to zfs_prop_get(), but returns the value as an integer. */ int zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, zprop_source_t *src, char *statbuf, size_t statlen) { char *source; /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) { return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, dgettext(TEXT_DOMAIN, "cannot get property '%s'"), zfs_prop_to_name(prop))); } if (src) *src = ZPROP_SRC_NONE; if (get_numeric_property(zhp, prop, src, &source, value) != 0) return (-1); get_source(zhp, src, source, statbuf, statlen); return (0); } #ifdef HAVE_IDMAP static int idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, char **domainp, idmap_rid_t *ridp) { idmap_get_handle_t *get_hdl = NULL; idmap_stat status; int err = EINVAL; if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) goto out; if (isuser) { err = idmap_get_sidbyuid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } else { err = idmap_get_sidbygid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } if (err == IDMAP_SUCCESS && idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && status == IDMAP_SUCCESS) err = 0; else err = EINVAL; out: if (get_hdl) idmap_get_destroy(get_hdl); return (err); } #endif /* HAVE_IDMAP */ /* * convert the propname into parameters needed by kernel * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 * Eg: groupquota@staff -> ZFS_PROP_GROUPQUOTA, "", 1234 * Eg: groupused@staff -> ZFS_PROP_GROUPUSED, "", 1234 * Eg: projectquota@123 -> ZFS_PROP_PROJECTQUOTA, "", 123 * Eg: projectused@789 -> ZFS_PROP_PROJECTUSED, "", 789 */ static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) { zfs_userquota_prop_t type; char *cp; boolean_t isuser; boolean_t isgroup; boolean_t isproject; struct passwd *pw; struct group *gr; domain[0] = '\0'; /* Figure out the property type ({user|group|project}{quota|space}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(propname, zfs_userquota_prop_prefixes[type], strlen(zfs_userquota_prop_prefixes[type])) == 0) break; } if (type == ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); *typep = type; isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_USEROBJUSED); isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_GROUPOBJUSED); isproject = (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED); cp = strchr(propname, '@') + 1; if (isuser && (pw = getpwnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = pw->pw_uid; } else if (isgroup && (gr = getgrnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp 
= gr->gr_gid; } else if (!isproject && strchr(cp, '@')) { #ifdef HAVE_IDMAP /* * It's a SID name (eg "user@domain") that needs to be * turned into S-1-domainID-RID. */ directory_error_t e; char *numericsid = NULL; char *end; if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); if (isuser) { e = directory_sid_from_user_name(NULL, cp, &numericsid); } else { e = directory_sid_from_group_name(NULL, cp, &numericsid); } if (e != NULL) { directory_error_free(e); return (ENOENT); } if (numericsid == NULL) return (ENOENT); cp = numericsid; (void) strlcpy(domain, cp, domainlen); cp = strrchr(domain, '-'); *cp = '\0'; cp++; errno = 0; *ridp = strtoull(cp, &end, 10); free(numericsid); if (errno != 0 || *end != '\0') return (EINVAL); #else (void) domainlen; return (ENOSYS); #endif /* HAVE_IDMAP */ } else { /* It's a user/group/project ID (eg "12345"). */ uid_t id; char *end; id = strtoul(cp, &end, 10); if (*end != '\0') return (EINVAL); if (id > MAXUID && !isproject) { #ifdef HAVE_IDMAP /* It's an ephemeral ID. */ idmap_rid_t rid; char *mapdomain; if (idmap_id_to_numeric_domain_rid(id, isuser, &mapdomain, &rid) != 0) return (ENOENT); (void) strlcpy(domain, mapdomain, domainlen); *ridp = rid; #else return (ENOSYS); #endif /* HAVE_IDMAP */ } else { *ridp = id; } } return (0); } static int zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue, zfs_userquota_prop_t *typep) { int err; zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); err = userquota_propname_decode(propname, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); zc.zc_objset_type = *typep; if (err) return (err); err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { zfs_userquota_prop_t type; return (zfs_prop_get_userquota_common(zhp, propname, propvalue, &type)); } int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; zfs_userquota_prop_t type; err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, &type); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else if (propvalue == 0 && (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTOBJQUOTA)) { (void) strlcpy(propbuf, "none", proplen); } else if (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(propvalue, propbuf, proplen); } else { zfs_nicenum(propvalue, propbuf, proplen); } return (0); } /* * propname must start with "written@" or "written#". 
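 * For example (hypothetical snapshot name), "written@snap1" asks how much
 * space was written to this dataset since snapshot 'snap1'. A bare
 * snapshot or bookmark name is expanded against the dataset's own name;
 * a fully qualified "pool/fs@snap1" or "pool/fs#mark" is used as given.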
*/ int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { int err; zfs_cmd_t zc = {"\0"}; const char *snapname; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); assert(zfs_prop_written(propname)); snapname = propname + strlen("written@"); if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { /* full snapshot or bookmark name specified */ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); } else { /* snapname is the short name, append it to zhp's fsname */ char *cp; (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); cp = strchr(zc.zc_value, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); } err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; err = zfs_prop_get_written_int(zhp, propname, &propvalue); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else { zfs_nicebytes(propvalue, propbuf, proplen); } return (0); } /* * Returns the name of the given zfs handle. */ const char * zfs_get_name(const zfs_handle_t *zhp) { return (zhp->zfs_name); } /* * Returns the name of the parent pool for the given zfs handle. */ const char * zfs_get_pool_name(const zfs_handle_t *zhp) { return (zhp->zpool_hdl->zpool_name); } /* * Returns the type of the given zfs handle. */ zfs_type_t zfs_get_type(const zfs_handle_t *zhp) { return (zhp->zfs_type); } /* * Returns the type of the given zfs handle, * or, if a snapshot, the type of the snapshotted dataset. */ zfs_type_t zfs_get_underlying_type(const zfs_handle_t *zhp) { return (zhp->zfs_head_type); } /* * Is one dataset name a child dataset of another? * * Needs to handle these cases: * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" * Descendant? No. No. No. Yes. */ static boolean_t is_descendant(const char *ds1, const char *ds2) { size_t d1len = strlen(ds1); /* ds2 can't be a descendant if it's smaller */ if (strlen(ds2) < d1len) return (B_FALSE); /* otherwise, compare strings and verify that there's a '/' char */ return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); } /* * Given a complete name, return just the portion that refers to the parent. * Will return -1 if there is no parent (path is just the name of the * pool). */ static int parent_name(const char *path, char *buf, size_t buflen) { char *slashp; (void) strlcpy(buf, path, buflen); if ((slashp = strrchr(buf, '/')) == NULL) return (-1); *slashp = '\0'; return (0); } int zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen) { return (parent_name(zfs_get_name(zhp), buf, buflen)); } /* * If accept_ancestor is false, then check to make sure that the given path has * a parent, and that it exists. If accept_ancestor is true, then find the * closest existing ancestor for the given path. In prefixlen return the * length of already existing prefix of the given path. We also fetch the * 'zoned' property, which is used to validate property settings when creating * new datasets. 
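 * For example (hypothetical names): given path "tank/a/b/c" with
 * accept_ancestor set and only "tank/a" existing, the walk stops at
 * "tank/a" and *prefixlen is set to strlen("tank/a").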
*/ static int check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, boolean_t accept_ancestor, int *prefixlen) { zfs_cmd_t zc = {"\0"}; char parent[ZFS_MAX_DATASET_NAME_LEN]; char *slash; zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; uint64_t is_zoned; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* get parent, and check to see if this is just a pool */ if (parent_name(path, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* check to see if the pool exists */ if ((slash = strchr(parent, '/')) == NULL) slash = parent + strlen(parent); - (void) strncpy(zc.zc_name, parent, slash - parent); - zc.zc_name[slash - parent] = '\0'; + (void) strlcpy(zc.zc_name, parent, + MIN(sizeof (zc.zc_name), slash - parent + 1)); if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 && errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* check to see if the parent dataset exists */ while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { if (errno == ENOENT && accept_ancestor) { /* * Go deeper to find an ancestor, give up on top level. */ if (parent_name(parent, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } } else if (errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent does not exist")); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } else return (zfs_standard_error(hdl, errno, errbuf)); } is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned != NULL) *zoned = is_zoned; /* we are in a non-global zone, but parent is in the global zone */ if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { (void) zfs_standard_error(hdl, EPERM, errbuf); zfs_close(zhp); return (-1); } /* make sure parent is a filesystem */ if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent is not a filesystem")); (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (-1); } zfs_close(zhp); if (prefixlen != NULL) *prefixlen = strlen(parent); return (0); } /* * Finds whether the dataset of the given type(s) exists. */ boolean_t zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) { zfs_handle_t *zhp; if (!zfs_validate_name(hdl, path, types, B_FALSE)) return (B_FALSE); /* * Try to get stats for the dataset, which will tell us if it exists. */ if ((zhp = make_dataset_handle(hdl, path)) != NULL) { int ds_type = zhp->zfs_type; zfs_close(zhp); if (types & ds_type) return (B_TRUE); } return (B_FALSE); } /* * Given a path to 'target', create all the ancestors between * the prefixlen portion of the path, and the target itself. * Fail if the initial prefixlen-ancestor does not already exist. */ int create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) { zfs_handle_t *h; char *cp; const char *opname; /* make sure prefix exists */ cp = target + prefixlen; if (*cp != '/') { assert(strchr(cp, '/') == NULL); h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); } else { *cp = '\0'; h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); *cp = '/'; } if (h == NULL) return (-1); zfs_close(h); /* * Attempt to create, mount, and share any ancestor filesystems, * up to the prefixlen-long one. 
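 * Continuing the example from check_parents() (hypothetical names): with
 * target "tank/a/b/c" and an existing prefix "tank/a", this loop creates
 * and mounts "tank/a/b"; the final component "tank/a/b/c" is left for the
 * caller to create.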
*/ for (cp = target + prefixlen + 1; (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { *cp = '\0'; h = make_dataset_handle(hdl, target); if (h) { /* it already exists, nothing to do here */ zfs_close(h); continue; } if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "create"); goto ancestorerr; } h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); if (h == NULL) { opname = dgettext(TEXT_DOMAIN, "open"); goto ancestorerr; } if (zfs_mount(h, NULL, 0) != 0) { opname = dgettext(TEXT_DOMAIN, "mount"); goto ancestorerr; } if (zfs_share(h, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "share"); goto ancestorerr; } zfs_close(h); } zfs_commit_shares(NULL); return (0); ancestorerr: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to %s ancestor '%s'"), opname, target); return (-1); } /* * Creates non-existing ancestors of the given path. */ int zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) { int prefix; char *path_copy; char errbuf[ERRBUFLEN]; int rc = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* * Check that we are not passing the nesting limit * before we start creating any ancestors. */ if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); if ((path_copy = strdup(path)) != NULL) { rc = create_parents(hdl, path_copy, prefix); free(path_copy); } if (path_copy == NULL || rc != 0) return (-1); return (0); } /* * Create a new filesystem or volume. */ int zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, nvlist_t *props) { int ret; uint64_t size = 0; uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); uint64_t zoned; enum lzc_dataset_type ost; zpool_handle_t *zpool_handle; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[ERRBUFLEN]; char parent[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* validate the path, taking care to note the extended error message */ if (!zfs_validate_name(hdl, path, type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* validate parents exist */ if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) return (-1); /* * The failure modes when creating a dataset of a different type over * one that already exists is a little strange. In particular, if you * try to create a dataset on top of an existing dataset, the ioctl() * will return ENOENT, not EEXIST. To prevent this from happening, we * first try to see if the dataset exists. 
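 * Checking up front lets us report "dataset already exists" rather than
 * the misleading "no such dataset" error the ioctl would otherwise
 * produce.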
*/ if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (type == ZFS_TYPE_VOLUME) ost = LZC_DATSET_TYPE_ZVOL; else ost = LZC_DATSET_TYPE_ZFS; /* open zpool handle for prop validation */ char pool_path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(pool_path, path, sizeof (pool_path)); /* truncate pool_path at first slash */ char *p = strchr(pool_path, '/'); if (p != NULL) *p = '\0'; if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL) return (-1); if (props && (props = zfs_valid_proplist(hdl, type, props, zoned, NULL, zpool_handle, B_TRUE, errbuf)) == 0) { zpool_close(zpool_handle); return (-1); } zpool_close(zpool_handle); if (type == ZFS_TYPE_VOLUME) { /* * If we are creating a volume, the size and block size must * satisfy a few restraints. First, the blocksize must be a * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the * volsize must be a multiple of the block size, and cannot be * zero. */ if (props == NULL || nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if ((ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blocksize)) != 0) { if (ret == ENOENT) { blocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); } else { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume block size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } if (size == 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size cannot be zero")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if (size % blocksize != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size must be a multiple of volume block " "size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } (void) parent_name(path, parent, sizeof (parent)); if (zfs_crypto_create(hdl, parent, props, NULL, B_TRUE, &wkeydata, &wkeylen) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } /* create the dataset */ ret = lzc_create(path, ost, props, wkeydata, wkeylen); nvlist_free(props); if (wkeydata != NULL) free(wkeydata); /* check for failure */ if (ret != 0) { switch (errno) { case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(hdl, EZFS_NOENT, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to set this " "property or value")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption root's key is not loaded " "or provided")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ERANGE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property value(s) specified")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); #ifdef _ILP32 case EOVERFLOW: /* * This platform can't address a volume this big. */ if (type == ZFS_TYPE_VOLUME) return (zfs_error(hdl, EZFS_VOLTOOBIG, errbuf)); zfs_fallthrough; #endif default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (0); } /* * Destroys the given dataset. The caller must make sure that the filesystem * isn't mounted, and that there are no active dependents. If the file system * does not exist this function does nothing. 
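 *
 * A usage sketch (hypothetical names): deferred destruction of a snapshot
 * might look like
 *
 *	zfs_handle_t *snap = zfs_open(hdl, "pool/fs@old", ZFS_TYPE_SNAPSHOT);
 *	if (snap != NULL) {
 *		err = zfs_destroy(snap, B_TRUE);
 *		zfs_close(snap);
 *	}
 *
 * The 'defer' flag is only meaningful for snapshots; for any other type a
 * non-zero defer argument is rejected with EINVAL.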
*/ int zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { int error; if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer) return (EINVAL); if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_bookmarks(nv, NULL); fnvlist_free(nv); if (error != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } return (0); } if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { error = lzc_destroy(zhp->zfs_name); } if (error != 0 && error != ENOENT) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } remove_mountpoint(zhp); return (0); } struct destroydata { nvlist_t *nvl; const char *snapname; }; static int zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, dd->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_boolean(dd->nvl, name); rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); zfs_close(zhp); return (rv); } /* * Destroys all snapshots with the given name in zhp & descendants. */ int zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; dd.nvl = fnvlist_alloc(); (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); if (nvlist_empty(dd.nvl)) { ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), zhp->zfs_name, snapname); } else { ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); } fnvlist_free(dd.nvl); return (ret); } /* * Destroys all the snapshots named in the nvlist. */ int zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) { nvlist_t *errlist = NULL; nvpair_t *pair; int ret = zfs_destroy_snaps_nvl_os(hdl, snaps); if (ret != 0) return (ret); ret = lzc_destroy_snaps(snaps, defer, &errlist); if (ret == 0) { nvlist_free(errlist); return (0); } if (nvlist_empty(errlist)) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); ret = zfs_standard_error(hdl, ret, errbuf); } for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), nvpair_name(pair)); switch (fnvpair_value_int32(pair)) { case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot is cloned")); ret = zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: ret = zfs_standard_error(hdl, errno, errbuf); break; } } nvlist_free(errlist); return (ret); } /* * Clones the given dataset. The target must be of the same type as the source. 
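 * The handle must refer to a snapshot (asserted below). A hypothetical
 * caller cloning "pool/fs@snap" into a new filesystem "pool/newfs":
 *
 *	zfs_handle_t *snap = zfs_open(hdl, "pool/fs@snap", ZFS_TYPE_SNAPSHOT);
 *	if (snap != NULL) {
 *		err = zfs_clone(snap, "pool/newfs", NULL);
 *		zfs_close(snap);
 *	}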
*/ int zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) { char parent[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; uint64_t zoned; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), target); /* validate the target/clone name */ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents exist */ if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) return (-1); (void) parent_name(target, parent, sizeof (parent)); /* do the clone */ if (props) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; if (ZFS_IS_VOLUME(zhp)) type = ZFS_TYPE_VOLUME; if ((props = zfs_valid_proplist(hdl, type, props, zoned, zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL) return (-1); if (zfs_fix_auto_resv(zhp, props) == -1) { nvlist_free(props); return (-1); } } if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } ret = lzc_clone(target, zhp->zfs_name, props); nvlist_free(props); if (ret != 0) { switch (errno) { case ENOENT: /* * The parent doesn't exist. We should have caught this * above, but there may a race condition that has since * destroyed the parent. * * At this point, we don't know whether it's the source * that doesn't exist anymore, or whether the target * dataset doesn't exist. */ zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); case EXDEV: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "source and target pools differ")); return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, errbuf)); default: return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (ret); } /* * Promotes the given clone fs to be the clone parent. */ int zfs_promote(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; char snapname[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot promote '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots can not be promoted")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (zhp->zfs_dmustats.dds_origin[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname)); if (ret != 0) { switch (ret) { case EACCES: /* * Promoting encrypted dataset outside its * encryption root. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot promote dataset outside its " "encryption root")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); case EEXIST: /* There is a conflicting snapshot name. 
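 * Promotion transfers the origin's older snapshots to the clone, so a
 * snapshot name that exists on both datasets makes the operation fail;
 * lzc_promote() returns the offending name in 'snapname'.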
*/ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "conflicting snapshot '%s' from parent '%s'"), snapname, zhp->zfs_dmustats.dds_origin); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); default: return (zfs_standard_error(hdl, ret, errbuf)); } } return (ret); } typedef struct snapdata { nvlist_t *sd_nvl; const char *sd_snapname; } snapdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snapdata_t *sd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { if (snprintf(name, sizeof (name), "%s@%s", zfs_get_name(zhp), sd->sd_snapname) >= sizeof (name)) return (EINVAL); fnvlist_add_boolean(sd->sd_nvl, name); rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); } zfs_close(zhp); return (rv); } /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. */ int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) { int ret; char errbuf[ERRBUFLEN]; nvpair_t *elem; nvlist_t *errors; zpool_handle_t *zpool_hdl; char pool[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshots ")); elem = NULL; while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { const char *snapname = nvpair_name(elem); /* validate the target name */ if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, B_TRUE)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), snapname); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } /* * get pool handle for prop validation. assumes all snaps are in the * same pool, as does lzc_snapshot (below). */ elem = nvlist_next_nvpair(snaps, NULL); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; zpool_hdl = zpool_open(hdl, pool); if (zpool_hdl == NULL) return (-1); if (props != NULL && (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf)) == NULL) { zpool_close(zpool_hdl); return (-1); } zpool_close(zpool_hdl); ret = lzc_snapshot(snaps, props, &errors); if (ret != 0) { boolean_t printed = B_FALSE; for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), nvpair_name(elem)); (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); printed = B_TRUE; } if (!printed) { switch (ret) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple snapshots of same " "fs not allowed")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } } nvlist_free(props); nvlist_free(errors); return (ret); } int zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, nvlist_t *props) { int ret; snapdata_t sd = { 0 }; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot snapshot %s"), path); if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); (void) strlcpy(fsname, path, sizeof (fsname)); cp = strchr(fsname, '@'); *cp = '\0'; sd.sd_snapname = cp + 1; if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { return (-1); } sd.sd_nvl = fnvlist_alloc(); if (recursive) { (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); } else { fnvlist_add_boolean(sd.sd_nvl, path); } ret = zfs_snapshot_nvl(hdl, 
sd.sd_nvl, props); fnvlist_free(sd.sd_nvl); zfs_close(zhp); return (ret); } /* * Destroy any more recent snapshots. We invoke this callback on any dependents * of the snapshot first. If the 'cb_dependent' member is non-zero, then this * is a dependent and we should just destroy it without checking the transaction * group. */ typedef struct rollback_data { const char *cb_target; /* the snapshot */ uint64_t cb_create; /* creation time reference */ boolean_t cb_error; boolean_t cb_force; } rollback_data_t; static int rollback_destroy_dependent(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; prop_changelist_t *clp; /* We must destroy this clone; first unmount it */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, cbp->cb_force ? MS_FORCE: 0); if (clp == NULL || changelist_prefix(clp) != 0) { cbp->cb_error = B_TRUE; zfs_close(zhp); return (0); } if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); (void) changelist_postfix(clp); changelist_free(clp); zfs_close(zhp); return (0); } static int rollback_destroy(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, rollback_destroy_dependent, cbp); cbp->cb_error |= zfs_destroy(zhp, B_FALSE); } zfs_close(zhp); return (0); } /* * Given a dataset, rollback to a specific snapshot, discarding any * data changes since then and making it the active dataset. * * Any snapshots and bookmarks more recent than the target are * destroyed, along with their dependents (i.e. clones). */ int zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) { rollback_data_t cb = { 0 }; int err; boolean_t restore_resv = 0; uint64_t old_volsize = 0, new_volsize; zfs_prop_t resv_prop = { 0 }; uint64_t min_txg = 0; assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || zhp->zfs_type == ZFS_TYPE_VOLUME); /* * Destroy all recent snapshots and their dependents. */ cb.cb_force = force; cb.cb_target = snap->zfs_name; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); if (cb.cb_create > 0) min_txg = cb.cb_create; (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb, min_txg, 0); (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb); if (cb.cb_error) return (-1); /* * Now that we have verified that the snapshot is the latest, * rollback to the given snapshot. */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); restore_resv = (old_volsize == zfs_prop_get_int(zhp, resv_prop)); } /* * Pass both the filesystem and the wanted snapshot names, * we would get an error back if the snapshot is destroyed or * a new snapshot is created before this request is processed. 
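 * In practice lzc_rollback_to() returns EEXIST if a more recent snapshot
 * or bookmark has appeared in the meantime and ESRCH if the named
 * snapshot no longer exists; both are mapped to user-facing errors below.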
*/ err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name); if (err != 0) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), zhp->zfs_name); switch (err) { case EEXIST: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "there is a snapshot or bookmark more recent " "than '%s'"), snap->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf); break; case ESRCH: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is not found among snapshots of '%s'"), snap->zfs_name, zhp->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf); break; case EINVAL: (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf); } return (err); } /* * For volumes, if the pre-rollback volsize matched the pre- * rollback reservation and the volsize has changed then set * the reservation property to the post-rollback volsize. * Make a new handle since the rollback closed the dataset. */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) err = zfs_prop_set_int(zhp, resv_prop, new_volsize); } zfs_close(zhp); } return (err); } /* * Renames the given dataset. */ int zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags) { int ret = 0; zfs_cmd_t zc = {"\0"}; char *delim; prop_changelist_t *cl = NULL; char parent[ZFS_MAX_DATASET_NAME_LEN]; char property[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; /* if we have the same exact name, just return success */ if (strcmp(zhp->zfs_name, target) == 0) return (0); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename to '%s'"), target); /* make sure source name is valid */ if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* * Make sure the target name is valid */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { if ((strchr(target, '@') == NULL) || *target == '@') { /* * Snapshot target name is abbreviated, * reconstruct full dataset name */ (void) strlcpy(parent, zhp->zfs_name, sizeof (parent)); delim = strchr(parent, '@'); if (strchr(target, '@') == NULL) *(++delim) = '\0'; else *delim = '\0'; (void) strlcat(parent, target, sizeof (parent)); target = parent; } else { /* * Make sure we're renaming within the same dataset. 
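 * (i.e. everything before the '@' in the target must match the source dataset name, as checked below.)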
*/ delim = strchr(target, '@'); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '@') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots must be part of same " "dataset")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { if (flags.recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "recursive rename must be a snapshot")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents */ if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) return (-1); /* make sure we're in the same pool */ verify((delim = strchr(target, '/')) != NULL); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "datasets must be within same pool")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } /* new name cannot be a child of the current dataset name */ if (is_descendant(zhp->zfs_name, target)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "New dataset name cannot be a descendant of " "current dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); if (getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Avoid unmounting file systems with mountpoint property set to * 'legacy' or 'none' even if -u option is not given. */ if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && !flags.recursive && !flags.nounmount && zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && (strcmp(property, "legacy") == 0 || strcmp(property, "none") == 0)) { flags.nounmount = B_TRUE; } if (flags.recursive) { char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); delim = strchr(parentname, '@'); *delim = '\0'; zfs_handle_t *zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); free(parentname); if (zhrp == NULL) { ret = -1; goto error; } zfs_close(zhrp); } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, flags.nounmount ? CL_GATHER_DONT_UNMOUNT : CL_GATHER_ITER_MOUNTED, flags.forceunmount ? 
MS_FORCE : 0)) == NULL) return (-1); if (changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); (void) zfs_error(hdl, EZFS_ZONED, errbuf); ret = -1; goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; } if (ZFS_IS_VOLUME(zhp)) zc.zc_objset_type = DMU_OST_ZVOL; else zc.zc_objset_type = DMU_OST_ZFS; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); zc.zc_cookie = !!flags.recursive; zc.zc_cookie |= (!!flags.nounmount) << 1; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { /* * if it was recursive, the one that actually failed will * be in zc.zc_name */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zc.zc_name); if (flags.recursive && errno == EEXIST) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "a child dataset already has a snapshot " "with the new name")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else if (errno == EACCES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot move encrypted child outside of " "its encryption root")); (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); } else { (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); } /* * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. */ if (cl != NULL) (void) changelist_postfix(cl); } else { if (cl != NULL) { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } } error: if (cl != NULL) { changelist_free(cl); } return (ret); } nvlist_t * zfs_get_all_props(zfs_handle_t *zhp) { return (zhp->zfs_props); } nvlist_t * zfs_get_recvd_props(zfs_handle_t *zhp) { if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (NULL); return (zhp->zfs_recvd_props); } nvlist_t * zfs_get_user_props(zfs_handle_t *zhp) { return (zhp->zfs_user_props); } /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: * * - If this is a list of all properties, then expand the list to include * all native properties, and set a flag so that for each dataset we look * for new unique user properties and add them to the list. * * - For non fixed-width properties, keep track of the maximum width seen * so that we can size the column appropriately. If the user has * requested received property values, we also need to compute the width * of the RECEIVED column. */ int zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, boolean_t literal) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; zprop_list_t **last, **start; nvlist_t *userprops, *propval; nvpair_t *elem; char *strval; char buf[ZFS_MAXPROPLEN]; if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) return (-1); userprops = zfs_get_user_props(zhp); entry = *plp; if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { /* * Go through and add any user properties as necessary. We * start by incrementing our list pointer to the first * non-native property. */ start = plp; while (*start != NULL) { if ((*start)->pl_prop == ZPROP_USERPROP) break; start = &(*start)->pl_next; } elem = NULL; while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { /* * See if we've already found this property in our list. 
*/ for (last = start; *last != NULL; last = &(*last)->pl_next) { if (strcmp((*last)->pl_user_prop, nvpair_name(elem)) == 0) break; } if (*last == NULL) { entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_user_prop = zfs_strdup(hdl, nvpair_name(elem)); entry->pl_prop = ZPROP_USERPROP; entry->pl_width = strlen(nvpair_name(elem)); entry->pl_all = B_TRUE; *last = entry; } } } /* * Now go through and check the width of any non-fixed columns */ for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, entry->pl_prop, buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (received && zfs_prop_get_recvd(zhp, zfs_prop_to_name(entry->pl_prop), buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } else { if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, &propval) == 0) { strval = fnvlist_lookup_string(propval, ZPROP_VALUE); if (strlen(strval) > entry->pl_width) entry->pl_width = strlen(strval); } if (received && zfs_prop_get_recvd(zhp, entry->pl_user_prop, buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } } return (0); } void zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) { nvpair_t *curr; nvpair_t *next; /* * Keep a reference to the props-table against which we prune the * properties. */ zhp->zfs_props_table = props; curr = nvlist_next_nvpair(zhp->zfs_props, NULL); while (curr) { zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); next = nvlist_next_nvpair(zhp->zfs_props, curr); /* * User properties will result in ZPROP_USERPROP (an alias * for ZPROP_INVAL), and since we * only know how to prune standard ZFS properties, we always * leave these in the list. This can also happen if we * encounter an unknown DSL property (when running older * software, for example). 
*/ if (zfs_prop != ZPROP_USERPROP && props[zfs_prop] == B_FALSE) (void) nvlist_remove(zhp->zfs_props, nvpair_name(curr), nvpair_type(curr)); curr = next; } } static int zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, zfs_smb_acl_op_t cmd, char *resource1, char *resource2) { zfs_cmd_t zc = {"\0"}; nvlist_t *nvlist = NULL; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); zc.zc_cookie = (uint64_t)cmd; if (cmd == ZFS_SMB_ACL_RENAME) { if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (0); } } switch (cmd) { case ZFS_SMB_ACL_ADD: case ZFS_SMB_ACL_REMOVE: (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); break; case ZFS_SMB_ACL_RENAME: if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, resource1) != 0) { (void) no_memory(hdl); return (-1); } if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, resource2) != 0) { (void) no_memory(hdl); return (-1); } zcmd_write_src_nvlist(hdl, &zc, nvlist); break; case ZFS_SMB_ACL_PURGE: break; default: return (-1); } error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); nvlist_free(nvlist); return (error); } int zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, resource, NULL)); } int zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, resource, NULL)); } int zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, NULL, NULL)); } int zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, char *oldname, char *newname) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, oldname, newname)); } int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, zfs_userspace_cb_t func, void *arg) { zfs_cmd_t zc = {"\0"}; zfs_useracct_t buf[100]; libzfs_handle_t *hdl = zhp->zfs_hdl; int ret; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = type; zc.zc_nvlist_dst = (uintptr_t)buf; for (;;) { zfs_useracct_t *zua = buf; zc.zc_nvlist_dst_size = sizeof (buf); if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { if ((errno == ENOTSUP && (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA))) break; return (zfs_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get used/quota for %s"), zc.zc_name)); } if (zc.zc_nvlist_dst_size == 0) break; while (zc.zc_nvlist_dst_size > 0) { if ((ret = func(arg, zua->zu_domain, zua->zu_rid, zua->zu_space)) != 0) return (ret); zua++; zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); } } return (0); } struct holdarg { nvlist_t *nvl; const char *snapname; const char *tag; boolean_t recursive; int error; }; static int zfs_hold_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_string(ha->nvl, name, ha->tag); if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); zfs_close(zhp); return (rv); } int zfs_hold(zfs_handle_t *zhp, const char *snapname, const char 
*tag, boolean_t recursive, int cleanup_fd) { int ret; struct holdarg ha; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { char errbuf[ERRBUFLEN]; fnvlist_free(ha.nvl); ret = ENOENT; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s@%s'"), zhp->zfs_name, snapname); (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); return (ret); } ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); fnvlist_free(ha.nvl); return (ret); } int zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) { int ret; nvlist_t *errors; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; nvpair_t *elem; errors = NULL; ret = lzc_hold(holds, cleanup_fd, &errors); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold")); switch (ret) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case E2BIG: /* * Temporary tags wind up having the ds object id * prepended. So even if we passed the length check * above, it's still possible for the tag to wind * up being slightly too long. */ (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case EEXIST: (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } static int zfs_release_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; nvlist_t *existing_holds; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) { ha->error = EINVAL; rv = EINVAL; } if (lzc_get_holds(name, &existing_holds) != 0) { ha->error = ENOENT; } else if (!nvlist_exists(existing_holds, ha->tag)) { ha->error = ESRCH; } else { nvlist_t *torelease = fnvlist_alloc(); fnvlist_add_boolean(torelease, ha->tag); fnvlist_add_nvlist(ha->nvl, name, torelease); fnvlist_free(torelease); } if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); zfs_close(zhp); return (rv); } int zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive) { int ret; struct holdarg ha; nvlist_t *errors = NULL; nvpair_t *elem; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; ha.error = 0; (void) zfs_release_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { fnvlist_free(ha.nvl); ret = ha.error; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s@%s'"), zhp->zfs_name, snapname); if (ret == ESRCH) { (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); } else { (void) zfs_standard_error(hdl, ret, errbuf); } return (ret); } ret = lzc_release(ha.nvl, &errors); 
fnvlist_free(ha.nvl); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release")); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zfs_standard_error(hdl, errno, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case ESRCH: (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } int zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; int nvsz = 2048; void *nvbuf; int err = 0; char errbuf[ERRBUFLEN]; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); tryagain: nvbuf = malloc(nvsz); if (nvbuf == NULL) { err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); goto out; } zc.zc_nvlist_dst_size = nvsz; zc.zc_nvlist_dst = (uintptr_t)nvbuf; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); switch (errno) { case ENOMEM: free(nvbuf); nvsz = zc.zc_nvlist_dst_size; goto tryagain; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } else { /* success */ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); if (rc) { err = zfs_standard_error_fmt(hdl, rc, dgettext( TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); } } free(nvbuf); out: return (err); } int zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; char *nvbuf; char errbuf[ERRBUFLEN]; size_t nvsz; int err; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); assert(err == 0); nvbuf = malloc(nvsz); err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); assert(err == 0); zc.zc_nvlist_src_size = nvsz; zc.zc_nvlist_src = (uintptr_t)nvbuf; zc.zc_perm_action = un; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), zc.zc_name); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } free(nvbuf); return (err); } int zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) { int err; char errbuf[ERRBUFLEN]; err = 
lzc_get_holds(zhp->zfs_name, nvl); if (err != 0) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), zhp->zfs_name); switch (err) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } return (err); } /* * The theory of raidz space accounting * * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block * will "reference" 128KB, even though it allocates more than that, to store the * parity information (and perhaps skip sectors). This concept of the * "referenced" (and other DMU space accounting) being lower than the allocated * space by a constant factor is called "raidz deflation." * * As mentioned above, the constant factor for raidz deflation assumes a 128KB * block size. However, zvols typically have a much smaller block size (default * 8KB). These smaller blocks may require proportionally much more parity * information (and perhaps skip sectors). In this case, the change to the * "referenced" property may be much more than the logical block size. * * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written * as follows. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D8 | D16 | D24 | * | P1 | D1 | D9 | D17 | D25 | * | P2 | D2 | D10 | D18 | D26 | * | P3 | D3 | D11 | D19 | D27 | * | P4 | D4 | D12 | D20 | D28 | * | P5 | D5 | D13 | D21 | D29 | * | P6 | D6 | D14 | D22 | D30 | * | P7 | D7 | D15 | D23 | D31 | * +-------+-------+-------+-------+-------+ * * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data * sectors. The dataset's referenced will increase by 128k and the pool's * allocated and free properties will be adjusted by 160k. * * A 4k block written to the same raidz vdev will require two 4k sectors. The * blank cells represent unallocated space. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | | | | * +-------+-------+-------+-------+-------+ * * Above, notice that the 4k block required one sector for parity and another * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated * and free properties will be adjusted by 8k. The dataset will not be charged * 8k. Rather, it will be charged a value that is scaled according to the * overhead of the 128k block on the same vdev. This 8k allocation will be * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as * calculated in the 128k block example above. * * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 * allocations are a multiple of 3 sectors, and raidz3 allocations are a * multiple of 4 sectors. When a block does not fill the required number of * sectors, skip blocks (sectors) are used. 
* * An 8k block being written to a raidz vdev may be written as follows: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | | * +-------+-------+-------+-------+-------+ * * In order to maintain the nparity+1 allocation size, a skip block (S0) was * added. For this 8k block, the pool's allocated and free properties are * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in * the 128k block example above. * * The situation is slightly different for dRAID since the minimum allocation * size is the full group width. The same 8K block above would be written as * follows in a dRAID group: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | S1 | * +-------+-------+-------+-------+-------+ * * Compression may lead to a variety of block sizes being written for the same * volume or file. There is no clear way to reserve just the amount of space * that will be required, so the worst case (no compression) is assumed. * Note that metadata blocks will typically be compressed, so the reservation * size returned by zvol_volsize_to_reservation() will generally be slightly * larger than the maximum that the volume can reference. */ /* * Derived from function of same name in module/zfs/vdev_raidz.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. Note that the "referenced" space accounted will be less than this, but * not necessarily equal to "blksize", due to RAIDZ deflation. */ static uint64_t vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { uint64_t asize, ndata; ASSERT3U(ndisks, >, nparity); ndata = ndisks - nparity; asize = ((blksize - 1) >> ashift) + 1; asize += nparity * ((asize + ndata - 1) / ndata); asize = roundup(asize, nparity + 1) << ashift; return (asize); } /* * Derived from function of same name in module/zfs/vdev_draid.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. */ static uint64_t vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { ASSERT3U(ndisks, >, nparity); uint64_t ndata = ndisks - nparity; uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1; uint64_t asize = (rows * ndisks) << ashift; return (asize); } /* * Determine how much space will be allocated if it lands on the most space- * inefficient top-level vdev. Returns the size in bytes required to store one * copy of the volume data. See theory comment above. 
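 * As a rough check of vdev_raidz_asize() against the 5-wide raidz1 (ashift=12) example in the theory comment: an 8k block maps to 2 data sectors plus 1 parity sector, rounded up to a multiple of nparity+1 = 2, i.e. 4 sectors (16k); a 128k block maps to 32 data plus 8 parity sectors, i.e. 40 sectors (160k).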
*/ static uint64_t volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) { nvlist_t *config, *tree, **vdevs; uint_t nvdevs; uint64_t ret = 0; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (nblocks * blksize); } for (int v = 0; v < nvdevs; v++) { char *type; uint64_t nparity, ashift, asize, tsize; uint64_t volsize; if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, &type) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 && strcmp(type, VDEV_TYPE_DRAID) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, &nparity) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { nvlist_t **disks; uint_t ndisks; if (nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) continue; /* allocation size for the "typical" 128k block */ tsize = vdev_raidz_asize(ndisks, nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); } else { uint64_t ndata; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0) continue; /* allocation size for the "typical" 128k block */ tsize = vdev_draid_asize(ndata + nparity, nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ asize = vdev_draid_asize(ndata + nparity, nparity, ashift, blksize); } /* * Scale this size down as a ratio of 128k / tsize. * See theory statement above. */ volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; if (volsize > ret) { ret = volsize; } } if (ret == 0) { ret = nblocks * blksize; } return (ret); } /* * Convert the zvol's volume size to an appropriate reservation. See theory * comment above. * * Note: If this routine is updated, it is necessary to update the ZFS test * suite's shell version in reservation.shlib. */ uint64_t zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, nvlist_t *props) { uint64_t numdb; uint64_t nblocks, volblocksize; int ncopies; char *strval; if (nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) ncopies = atoi(strval); else ncopies = 1; if (nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; nblocks = volsize / volblocksize; /* * Metadata defaults to using 128k blocks, not volblocksize blocks. For * this reason, only the data blocks are scaled based on vdev config. */ volsize = volsize_from_vdevs(zph, nblocks, volblocksize); /* start with metadnode L0-L6 */ numdb = 7; /* calculate number of indirects */ while (nblocks > 1) { nblocks += DNODES_PER_LEVEL - 1; nblocks /= DNODES_PER_LEVEL; numdb += nblocks; } numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); volsize *= ncopies; /* * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't * compressed, but in practice they compress down to about * 1100 bytes */ numdb *= 1ULL << DN_MAX_INDBLKSHIFT; volsize += numdb; return (volsize); } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent fses are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zfs's wait cmd). 
In that * scenario, a fs being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the fs doesn't * exist. */ int zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait_fs(zhp->zfs_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), zhp->zfs_name); } return (error); } diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c index 93f6e19e9127..80588a860c18 100644 --- a/lib/libzfs/libzfs_diff.c +++ b/lib/libzfs/libzfs_diff.c @@ -1,790 +1,788 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015, 2018 by Delphix. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright 2016 Igor Kozhukhov */ /* * zfs diff support */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" #define ZDIFF_SNAPDIR "/.zfs/snapshot/" #define ZDIFF_PREFIX "zfs-diff-%d" #define ZDIFF_ADDED '+' #define ZDIFF_MODIFIED "M" #define ZDIFF_REMOVED '-' #define ZDIFF_RENAMED "R" /* * Given a {dsname, object id}, get the object path */ static int get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, char *pn, int maxlen, zfs_stat_t *sb) { zfs_cmd_t zc = {"\0"}; int error; (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; errno = 0; error = zfs_ioctl(di->zhp->zfs_hdl, ZFS_IOC_OBJ_TO_STATS, &zc); di->zerr = errno; /* we can get stats even if we failed to get a path */ (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t)); if (error == 0) { ASSERT(di->zerr == 0); (void) strlcpy(pn, zc.zc_value, maxlen); return (0); } if (di->zerr == ESTALE) { (void) snprintf(pn, maxlen, "(on_delete_queue)"); return (0); } else if (di->zerr == EPERM) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "The sys_config privilege or diff delegated permission " "is needed\nto discover path names")); return (-1); } else if (di->zerr == EACCES) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Key must be loaded to discover path names")); return (-1); } else { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Unable to determine path or stats for " "object %lld in %s"), (longlong_t)obj, dsname); return (-1); } } /* * stream_bytes * * Prints a file name out a character at a time. 
If the character is * not in the range of what we consider "printable" ASCII, display it * as an escaped 4-digit octal value. ASCII values less than a space * are all control characters and we declare the upper end as the * DELete character. This also is the last 7-bit ASCII character. * We choose to treat all 8-bit ASCII as not printable for this * application. */ static void stream_bytes(FILE *fp, const char *string) { char c; while ((c = *string++) != '\0') { if (c > ' ' && c != '\\' && c < '\177') { (void) fputc(c, fp); } else { (void) fprintf(fp, "\\%04hho", (uint8_t)c); } } } static char get_what(mode_t what) { switch (what & S_IFMT) { case S_IFBLK: return ('B'); case S_IFCHR: return ('C'); case S_IFDIR: return ('/'); #ifdef S_IFDOOR case S_IFDOOR: return ('>'); #endif case S_IFIFO: return ('|'); case S_IFLNK: return ('@'); #ifdef S_IFPORT case S_IFPORT: return ('P'); #endif case S_IFSOCK: return ('='); case S_IFREG: return ('F'); default: return ('?'); } } static void print_cmn(FILE *fp, differ_info_t *di, const char *file) { if (!di->no_mangle) { stream_bytes(fp, di->dsmnt); stream_bytes(fp, file); } else { (void) fputs(di->dsmnt, fp); (void) fputs(file, fp); } } static void print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new, zfs_stat_t *isb) { if (di->timestamped) (void) fprintf(fp, "%10lld.%09lld\t", (longlong_t)isb->zs_ctime[0], (longlong_t)isb->zs_ctime[1]); (void) fputs(ZDIFF_RENAMED "\t", fp); if (di->classify) (void) fprintf(fp, "%c\t", get_what(isb->zs_mode)); print_cmn(fp, di, old); (void) fputs(di->scripted ? "\t" : " -> ", fp); print_cmn(fp, di, new); (void) fputc('\n', fp); } static void print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file, zfs_stat_t *isb) { if (di->timestamped) (void) fprintf(fp, "%10lld.%09lld\t", (longlong_t)isb->zs_ctime[0], (longlong_t)isb->zs_ctime[1]); (void) fputs(ZDIFF_MODIFIED "\t", fp); if (di->classify) (void) fprintf(fp, "%c\t", get_what(isb->zs_mode)); print_cmn(fp, di, file); (void) fprintf(fp, "\t(%+d)\n", delta); } static void print_file(FILE *fp, differ_info_t *di, char type, const char *file, zfs_stat_t *isb) { if (di->timestamped) (void) fprintf(fp, "%10lld.%09lld\t", (longlong_t)isb->zs_ctime[0], (longlong_t)isb->zs_ctime[1]); (void) fprintf(fp, "%c\t", type); if (di->classify) (void) fprintf(fp, "%c\t", get_what(isb->zs_mode)); print_cmn(fp, di, file); (void) fputc('\n', fp); } static int write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) { struct zfs_stat fsb, tsb; mode_t fmode, tmode; char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN]; boolean_t already_logged = B_FALSE; int fobjerr, tobjerr; int change; if (dobj == di->shares) return (0); /* * Check the from and to snapshots for info on the object. If * we get ENOENT, then the object just didn't exist in that * snapshot. If we get ENOTSUP, then we tried to get * info on a non-ZPL object, which we don't care about anyway. * For any other error we print a warning which includes the * errno and continue. 
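 * (When both the from and to lookups fail for the same object, the warning is only printed once; see already_logged below.)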
*/ fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname, MAXPATHLEN, &fsb); if (fobjerr && di->zerr != ENOTSUP && di->zerr != ENOENT) { zfs_error_aux(di->zhp->zfs_hdl, "%s", strerror(di->zerr)); zfs_error(di->zhp->zfs_hdl, di->zerr, di->errbuf); /* * Let's not print an error for the same object more than * once if it happens in both snapshots */ already_logged = B_TRUE; } tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname, MAXPATHLEN, &tsb); if (tobjerr && di->zerr != ENOTSUP && di->zerr != ENOENT) { if (!already_logged) { zfs_error_aux(di->zhp->zfs_hdl, "%s", strerror(di->zerr)); zfs_error(di->zhp->zfs_hdl, di->zerr, di->errbuf); } } /* * Unallocated object sharing the same meta dnode block */ if (fobjerr && tobjerr) { di->zerr = 0; return (0); } di->zerr = 0; /* negate get_stats_for_obj() from side that failed */ fmode = fsb.zs_mode & S_IFMT; tmode = tsb.zs_mode & S_IFMT; if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 || tsb.zs_links == 0) change = 0; else change = tsb.zs_links - fsb.zs_links; if (fobjerr) { if (change) { print_link_change(fp, di, change, tobjname, &tsb); return (0); } print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); return (0); } else if (tobjerr) { if (change) { print_link_change(fp, di, change, fobjname, &fsb); return (0); } print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); return (0); } if (fmode != tmode && fsb.zs_gen == tsb.zs_gen) tsb.zs_gen++; /* Force a generational difference */ /* Simple modification or no change */ if (fsb.zs_gen == tsb.zs_gen) { /* No apparent changes. Could we assert !this? */ if (fsb.zs_ctime[0] == tsb.zs_ctime[0] && fsb.zs_ctime[1] == tsb.zs_ctime[1]) return (0); if (change) { print_link_change(fp, di, change, change > 0 ? fobjname : tobjname, &tsb); } else if (strcmp(fobjname, tobjname) == 0) { print_file(fp, di, *ZDIFF_MODIFIED, fobjname, &tsb); } else { print_rename(fp, di, fobjname, tobjname, &tsb); } return (0); } else { /* file re-created or object re-used */ print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); return (0); } } static int write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) { uint64_t o; int err; for (o = dr->ddr_first; o <= dr->ddr_last; o++) { if ((err = write_inuse_diffs_one(fp, di, o)) != 0) return (err); } return (0); } static int describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf, int maxlen) { struct zfs_stat sb; (void) get_stats_for_obj(di, di->fromsnap, object, namebuf, maxlen, &sb); /* Don't print if in the delete queue on from side */ if (di->zerr == ESTALE || di->zerr == ENOENT) { di->zerr = 0; return (0); } print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb); return (0); } static int write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *lhdl = di->zhp->zfs_hdl; char fobjname[MAXPATHLEN]; (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name)); zc.zc_obj = dr->ddr_first - 1; ASSERT(di->zerr == 0); while (zc.zc_obj < dr->ddr_last) { int err; err = zfs_ioctl(lhdl, ZFS_IOC_NEXT_OBJ, &zc); if (err == 0) { if (zc.zc_obj == di->shares) { zc.zc_obj++; continue; } if (zc.zc_obj > dr->ddr_last) { break; } err = describe_free(fp, di, zc.zc_obj, fobjname, MAXPATHLEN); } else if (errno == ESRCH) { break; } else { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "next allocated object (> %lld) find failure"), (longlong_t)zc.zc_obj); di->zerr = errno; break; } } if (di->zerr) return (-1); return (0); } static void * 
differ(void *arg) { differ_info_t *di = arg; dmu_diff_record_t dr; FILE *ofp; int err = 0; if ((ofp = fdopen(di->outputfd, "w")) == NULL) { di->zerr = errno; strlcpy(di->errbuf, strerror(errno), sizeof (di->errbuf)); (void) close(di->datafd); return ((void *)-1); } for (;;) { char *cp = (char *)&dr; int len = sizeof (dr); int rv; do { rv = read(di->datafd, cp, len); cp += rv; len -= rv; } while (len > 0 && rv > 0); if (rv < 0 || (rv == 0 && len != sizeof (dr))) { di->zerr = EPIPE; break; } else if (rv == 0) { /* end of file at a natural breaking point */ break; } switch (dr.ddr_type) { case DDR_FREE: err = write_free_diffs(ofp, di, &dr); break; case DDR_INUSE: err = write_inuse_diffs(ofp, di, &dr); break; default: di->zerr = EPIPE; break; } if (err || di->zerr) break; } (void) fclose(ofp); (void) close(di->datafd); if (err) return ((void *)-1); if (di->zerr) { ASSERT(di->zerr == EPIPE); (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Internal error: bad data from diff IOCTL")); return ((void *)-1); } return ((void *)0); } static int make_temp_snapshot(differ_info_t *di) { libzfs_handle_t *hdl = di->zhp->zfs_hdl; zfs_cmd_t zc = {"\0"}; (void) snprintf(zc.zc_value, sizeof (zc.zc_value), ZDIFF_PREFIX, getpid()); (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name)); zc.zc_cleanup_fd = di->cleanupfd; if (zfs_ioctl(hdl, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { int err = errno; if (err == EPERM) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "The diff delegated " "permission is needed in order\nto create a " "just-in-time snapshot for diffing\n")); return (zfs_error(hdl, EZFS_DIFF, di->errbuf)); } else { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Cannot create just-in-time " "snapshot of '%s'"), zc.zc_name); return (zfs_standard_error(hdl, err, di->errbuf)); } } di->tmpsnap = zfs_strdup(hdl, zc.zc_value); di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap); return (0); } static void teardown_differ_info(differ_info_t *di) { free(di->ds); free(di->dsmnt); free(di->fromsnap); free(di->frommnt); free(di->tosnap); free(di->tmpsnap); free(di->tomnt); (void) close(di->cleanupfd); } static int get_snapshot_names(differ_info_t *di, const char *fromsnap, const char *tosnap) { libzfs_handle_t *hdl = di->zhp->zfs_hdl; char *atptrf = NULL; char *atptrt = NULL; int fdslen, fsnlen; int tdslen, tsnlen; /* * Can accept * fdslen fsnlen tdslen tsnlen * dataset@snap1 * 0. dataset@snap1 dataset@snap2 >0 >1 >0 >1 * 1. dataset@snap1 @snap2 >0 >1 ==0 >1 * 2. dataset@snap1 dataset >0 >1 >0 ==0 * 3. @snap1 dataset@snap2 ==0 >1 >0 >1 * 4. @snap1 dataset ==0 >1 >0 ==0 */ if (tosnap == NULL) { /* only a from snapshot given, must be valid */ (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Badly formed snapshot name %s"), fromsnap); if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT, B_FALSE)) { return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); } atptrf = strchr(fromsnap, '@'); ASSERT(atptrf != NULL); fdslen = atptrf - fromsnap; di->fromsnap = zfs_strdup(hdl, fromsnap); di->ds = zfs_strdup(hdl, fromsnap); di->ds[fdslen] = '\0'; /* the to snap will be a just-in-time snap of the head */ return (make_temp_snapshot(di)); } (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Unable to determine which snapshots to compare")); atptrf = strchr(fromsnap, '@'); atptrt = strchr(tosnap, '@'); fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap); tdslen = atptrt ? 
atptrt - tosnap : strlen(tosnap); fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */ tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */ if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0)) { return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); } else if ((fdslen > 0 && tdslen > 0) && ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) { /* * not the same dataset name, might be okay if * tosnap is a clone of a fromsnap descendant. */ char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; zfs_handle_t *zhp; di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1); - (void) strncpy(di->ds, tosnap, tdslen); - di->ds[tdslen] = '\0'; + (void) strlcpy(di->ds, tosnap, tdslen + 1); zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM); while (zhp != NULL) { if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0) { (void) zfs_close(zhp); zhp = NULL; break; } if (strncmp(origin, fromsnap, fsnlen) == 0) break; (void) zfs_close(zhp); zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM); } if (zhp == NULL) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); } else { (void) zfs_close(zhp); } di->isclone = B_TRUE; di->fromsnap = zfs_strdup(hdl, fromsnap); if (tsnlen) di->tosnap = zfs_strdup(hdl, tosnap); else return (make_temp_snapshot(di)); } else { int dslen = fdslen ? fdslen : tdslen; di->ds = zfs_alloc(hdl, dslen + 1); - (void) strncpy(di->ds, fdslen ? fromsnap : tosnap, dslen); - di->ds[dslen] = '\0'; + (void) strlcpy(di->ds, fdslen ? fromsnap : tosnap, dslen + 1); di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf); if (tsnlen) { di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt); } else { return (make_temp_snapshot(di)); } } return (0); } static int get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt) { boolean_t mounted; mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt); if (mounted == B_FALSE) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Cannot diff an unmounted snapshot")); return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf)); } /* Avoid a double slash at the beginning of root-mounted datasets */ if (**mntpt == '/' && *(*mntpt + 1) == '\0') **mntpt = '\0'; return (0); } static int get_mountpoints(differ_info_t *di) { char *strptr; char *frommntpt; /* * first get the mountpoint for the parent dataset */ if (get_mountpoint(di, di->ds, &di->dsmnt) != 0) return (-1); strptr = strchr(di->tosnap, '@'); ASSERT3P(strptr, !=, NULL); di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt, ZDIFF_SNAPDIR, ++strptr); strptr = strchr(di->fromsnap, '@'); ASSERT3P(strptr, !=, NULL); frommntpt = di->dsmnt; if (di->isclone) { char *mntpt; int err; *strptr = '\0'; err = get_mountpoint(di, di->fromsnap, &mntpt); *strptr = '@'; if (err != 0) return (-1); frommntpt = mntpt; } di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt, ZDIFF_SNAPDIR, ++strptr); if (di->isclone) free(frommntpt); return (0); } static int setup_differ_info(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, differ_info_t *di) { di->zhp = zhp; di->cleanupfd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); VERIFY(di->cleanupfd >= 0); if (get_snapshot_names(di, fromsnap, tosnap) != 0) return (-1); if (get_mountpoints(di) != 0) return (-1); if (find_shares_object(di) != 0) return (-1); return (0); } int zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, const char *tosnap, int 
flags) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; differ_info_t di = { 0 }; pthread_t tid; int pipefd[2]; int iocerr; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "zfs diff failed")); if (setup_differ_info(zhp, fromsnap, tosnap, &di)) { teardown_differ_info(&di); return (-1); } if (pipe2(pipefd, O_CLOEXEC)) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); teardown_differ_info(&di); return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); } di.scripted = (flags & ZFS_DIFF_PARSEABLE); di.classify = (flags & ZFS_DIFF_CLASSIFY); di.timestamped = (flags & ZFS_DIFF_TIMESTAMP); di.no_mangle = (flags & ZFS_DIFF_NO_MANGLE); di.outputfd = outfd; di.datafd = pipefd[0]; if (pthread_create(&tid, NULL, differ, &di)) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); (void) close(pipefd[0]); (void) close(pipefd[1]); teardown_differ_info(&di); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } /* do the ioctl() */ (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1); (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1); zc.zc_cookie = pipefd[1]; iocerr = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DIFF, &zc); if (iocerr != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Unable to obtain diffs")); if (errno == EPERM) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "\n The sys_mount privilege or diff delegated " "permission is needed\n to execute the " "diff ioctl")); } else if (errno == EXDEV) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "\n Not an earlier snapshot from the same fs")); } else if (errno != EPIPE || di.zerr == 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); } (void) close(pipefd[1]); (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); teardown_differ_info(&di); if (di.zerr != 0 && di.zerr != EPIPE) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(di.zerr)); return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); } else { return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf)); } } (void) close(pipefd[1]); (void) pthread_join(tid, NULL); if (di.zerr != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(di.zerr)); return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); } teardown_differ_info(&di); return (0); } diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 8d6f1c93d8c9..b1e71e998078 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -1,384 +1,384 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2012 Garrett D'Amore . All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. 
*/ #include int taskq_now; taskq_t *system_taskq; taskq_t *system_delay_taskq; static pthread_key_t taskq_tsd; #define TASKQ_ACTIVE 0x00010000 static taskq_ent_t * task_alloc(taskq_t *tq, int tqflags) { taskq_ent_t *t; int rv; again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); tq->tq_freelist = t->tqent_next; } else { if (tq->tq_nalloc >= tq->tq_maxalloc) { if (!(tqflags & KM_SLEEP)) return (NULL); /* * We don't want to exceed tq_maxalloc, but we can't * wait for other tasks to complete (and thus free up * task structures) without risking deadlock with * the caller. So, we just delay for one second * to throttle the allocation rate. If we have tasks * complete before one second timeout expires then * taskq_ent_free will signal us and we will * immediately retry the allocation. */ tq->tq_maxalloc_wait++; rv = cv_timedwait(&tq->tq_maxalloc_cv, &tq->tq_lock, ddi_get_lbolt() + hz); tq->tq_maxalloc_wait--; if (rv > 0) goto again; /* signaled */ } mutex_exit(&tq->tq_lock); t = kmem_alloc(sizeof (taskq_ent_t), tqflags); mutex_enter(&tq->tq_lock); if (t != NULL) { /* Make sure we start without any flags */ t->tqent_flags = 0; tq->tq_nalloc++; } } return (t); } static void task_free(taskq_t *tq, taskq_ent_t *t) { if (tq->tq_nalloc <= tq->tq_minalloc) { t->tqent_next = tq->tq_freelist; tq->tq_freelist = t; } else { tq->tq_nalloc--; mutex_exit(&tq->tq_lock); kmem_free(t, sizeof (taskq_ent_t)); mutex_enter(&tq->tq_lock); } if (tq->tq_maxalloc_wait) cv_signal(&tq->tq_maxalloc_cv); } taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) { taskq_ent_t *t; if (taskq_now) { func(arg); return (1); } mutex_enter(&tq->tq_lock); ASSERT(tq->tq_flags & TASKQ_ACTIVE); if ((t = task_alloc(tq, tqflags)) == NULL) { mutex_exit(&tq->tq_lock); return (0); } if (tqflags & TQ_FRONT) { t->tqent_next = tq->tq_task.tqent_next; t->tqent_prev = &tq->tq_task; } else { t->tqent_next = &tq->tq_task; t->tqent_prev = tq->tq_task.tqent_prev; } t->tqent_next->tqent_prev = t; t->tqent_prev->tqent_next = t; t->tqent_func = func; t->tqent_arg = arg; t->tqent_flags = 0; cv_signal(&tq->tq_dispatch_cv); mutex_exit(&tq->tq_lock); return (1); } taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags, clock_t expire_time) { (void) tq, (void) func, (void) arg, (void) tqflags, (void) expire_time; return (0); } int taskq_empty_ent(taskq_ent_t *t) { return (t->tqent_next == NULL); } void taskq_init_ent(taskq_ent_t *t) { t->tqent_next = NULL; t->tqent_prev = NULL; t->tqent_func = NULL; t->tqent_arg = NULL; t->tqent_flags = 0; } void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) { ASSERT(func != NULL); /* * Mark it as a prealloc'd task. This is important * to ensure that we don't free it later. */ t->tqent_flags |= TQENT_FLAG_PREALLOC; /* * Enqueue the task to the underlying queue. 
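 * (TQ_FRONT entries are linked in at the head of the task list; all other entries are appended at the tail.)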
*/ mutex_enter(&tq->tq_lock); if (flags & TQ_FRONT) { t->tqent_next = tq->tq_task.tqent_next; t->tqent_prev = &tq->tq_task; } else { t->tqent_next = &tq->tq_task; t->tqent_prev = tq->tq_task.tqent_prev; } t->tqent_next->tqent_prev = t; t->tqent_prev->tqent_next = t; t->tqent_func = func; t->tqent_arg = arg; cv_signal(&tq->tq_dispatch_cv); mutex_exit(&tq->tq_lock); } void taskq_wait(taskq_t *tq) { mutex_enter(&tq->tq_lock); while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) cv_wait(&tq->tq_wait_cv, &tq->tq_lock); mutex_exit(&tq->tq_lock); } void taskq_wait_id(taskq_t *tq, taskqid_t id) { (void) id; taskq_wait(tq); } void taskq_wait_outstanding(taskq_t *tq, taskqid_t id) { (void) id; taskq_wait(tq); } static __attribute__((noreturn)) void taskq_thread(void *arg) { taskq_t *tq = arg; taskq_ent_t *t; boolean_t prealloc; VERIFY0(pthread_setspecific(taskq_tsd, tq)); mutex_enter(&tq->tq_lock); while (tq->tq_flags & TASKQ_ACTIVE) { if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { if (--tq->tq_active == 0) cv_broadcast(&tq->tq_wait_cv); cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); tq->tq_active++; continue; } t->tqent_prev->tqent_next = t->tqent_next; t->tqent_next->tqent_prev = t->tqent_prev; t->tqent_next = NULL; t->tqent_prev = NULL; prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC; mutex_exit(&tq->tq_lock); rw_enter(&tq->tq_threadlock, RW_READER); t->tqent_func(t->tqent_arg); rw_exit(&tq->tq_threadlock); mutex_enter(&tq->tq_lock); if (!prealloc) task_free(tq, t); } tq->tq_nthreads--; cv_broadcast(&tq->tq_wait_cv); mutex_exit(&tq->tq_lock); thread_exit(); } taskq_t * taskq_create(const char *name, int nthreads, pri_t pri, int minalloc, int maxalloc, uint_t flags) { (void) pri; taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); int t; if (flags & TASKQ_THREADS_CPU_PCT) { int pct; ASSERT3S(nthreads, >=, 0); ASSERT3S(nthreads, <=, 100); pct = MIN(nthreads, 100); pct = MAX(pct, 0); nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; nthreads = MAX(nthreads, 1); /* need at least 1 thread */ } else { ASSERT3S(nthreads, >=, 1); } rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); - (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN); + (void) strlcpy(tq->tq_name, name, sizeof (tq->tq_name)); tq->tq_flags = flags | TASKQ_ACTIVE; tq->tq_active = nthreads; tq->tq_nthreads = nthreads; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; tq->tq_task.tqent_next = &tq->tq_task; tq->tq_task.tqent_prev = &tq->tq_task; tq->tq_threadlist = kmem_alloc(nthreads * sizeof (kthread_t *), KM_SLEEP); if (flags & TASKQ_PREPOPULATE) { mutex_enter(&tq->tq_lock); while (minalloc-- > 0) task_free(tq, task_alloc(tq, KM_SLEEP)); mutex_exit(&tq->tq_lock); } for (t = 0; t < nthreads; t++) VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL); return (tq); } void taskq_destroy(taskq_t *tq) { int nthreads = tq->tq_nthreads; taskq_wait(tq); mutex_enter(&tq->tq_lock); tq->tq_flags &= ~TASKQ_ACTIVE; cv_broadcast(&tq->tq_dispatch_cv); while (tq->tq_nthreads != 0) cv_wait(&tq->tq_wait_cv, &tq->tq_lock); tq->tq_minalloc = 0; while (tq->tq_nalloc != 0) { ASSERT(tq->tq_freelist != NULL); task_free(tq, task_alloc(tq, KM_SLEEP)); } mutex_exit(&tq->tq_lock); kmem_free(tq->tq_threadlist, nthreads * sizeof (kthread_t *)); rw_destroy(&tq->tq_threadlock); 
mutex_destroy(&tq->tq_lock); cv_destroy(&tq->tq_dispatch_cv); cv_destroy(&tq->tq_wait_cv); cv_destroy(&tq->tq_maxalloc_cv); kmem_free(tq, sizeof (taskq_t)); } int taskq_member(taskq_t *tq, kthread_t *t) { int i; if (taskq_now) return (1); for (i = 0; i < tq->tq_nthreads; i++) if (tq->tq_threadlist[i] == t) return (1); return (0); } taskq_t * taskq_of_curthread(void) { return (pthread_getspecific(taskq_tsd)); } int taskq_cancel_id(taskq_t *tq, taskqid_t id) { (void) tq, (void) id; return (ENOENT); } void system_taskq_init(void) { VERIFY0(pthread_key_create(&taskq_tsd, NULL)); system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); system_delay_taskq = taskq_create("delay_taskq", 4, maxclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); } void system_taskq_fini(void) { taskq_destroy(system_taskq); system_taskq = NULL; /* defensive */ taskq_destroy(system_delay_taskq); system_delay_taskq = NULL; VERIFY0(pthread_key_delete(taskq_tsd)); } diff --git a/module/os/freebsd/spl/callb.c b/module/os/freebsd/spl/callb.c index ba13ea887938..47f3ccc0c7fa 100644 --- a/module/os/freebsd/spl/callb.c +++ b/module/os/freebsd/spl/callb.c @@ -1,373 +1,372 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for delay() */ #include /* For TASKQ_NAMELEN */ #include #define CB_MAXNAME TASKQ_NAMELEN /* * The callb mechanism provides generic event scheduling/echoing. * A callb function is registered and called on behalf of the event. */ typedef struct callb { struct callb *c_next; /* next in class or on freelist */ kthread_id_t c_thread; /* ptr to caller's thread struct */ char c_flag; /* info about the callb state */ uchar_t c_class; /* this callb's class */ kcondvar_t c_done_cv; /* signal callb completion */ boolean_t (*c_func)(void *, int); /* cb function: returns true if ok */ void *c_arg; /* arg to c_func */ char c_name[CB_MAXNAME+1]; /* debug:max func name length */ } callb_t; /* * callb c_flag bitmap definitions */ #define CALLB_FREE 0x0 #define CALLB_TAKEN 0x1 #define CALLB_EXECUTING 0x2 /* * Basic structure for a callb table. * All callbs are organized into different class groups described * by ct_class array. * The callbs within a class are single-linked and normally run by a * serial execution. 
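taskq_of_curthread() above relies on pthread thread-specific data: system_taskq_init() creates a shared key, each worker stores its owning queue under that key, and any code running on the thread can later ask which taskq it belongs to. A stand-alone sketch with hypothetical names and error handling reduced to asserts:

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_key_t cur_queue_key;

static void *
worker(void *arg)
{
    assert(pthread_setspecific(cur_queue_key, arg) == 0);
    printf("running on %s\n",
        (const char *)pthread_getspecific(cur_queue_key));
    return (NULL);
}

int
main(void)
{
    pthread_t t;

    assert(pthread_key_create(&cur_queue_key, NULL) == 0);
    assert(pthread_create(&t, NULL, worker, "system_taskq") == 0);
    assert(pthread_join(t, NULL) == 0);
    assert(pthread_key_delete(cur_queue_key) == 0);
    return (0);
}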
*/ typedef struct callb_table { kmutex_t ct_lock; /* protect all callb states */ callb_t *ct_freelist; /* free callb structures */ boolean_t ct_busy; /* B_TRUE prevents additions */ kcondvar_t ct_busy_cv; /* to wait for not busy */ int ct_ncallb; /* num of callbs allocated */ callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ } callb_table_t; int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; static callb_id_t callb_add_common(boolean_t (*)(void *, int), void *, int, char *, kthread_id_t); static callb_table_t callb_table; /* system level callback table */ static callb_table_t *ct = &callb_table; static kmutex_t callb_safe_mutex; callb_cpr_t callb_cprinfo_safe = { &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} }; /* * Init all callb tables in the system. */ static void callb_init(void *dummy __unused) { callb_table.ct_busy = B_FALSE; /* mark table open for additions */ mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); } static void callb_fini(void *dummy __unused) { callb_t *cp; int i; mutex_enter(&ct->ct_lock); for (i = 0; i < 16; i++) { while ((cp = ct->ct_freelist) != NULL) { ct->ct_freelist = cp->c_next; ct->ct_ncallb--; kmem_free(cp, sizeof (callb_t)); } if (ct->ct_ncallb == 0) break; /* Not all callbacks finished, waiting for the rest. */ mutex_exit(&ct->ct_lock); tsleep(ct, 0, "callb", hz / 4); mutex_enter(&ct->ct_lock); } if (ct->ct_ncallb > 0) printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb); mutex_exit(&ct->ct_lock); mutex_destroy(&callb_safe_mutex); mutex_destroy(&callb_table.ct_lock); } /* * callout_add() is called to register func() be called later. */ static callb_id_t callb_add_common(boolean_t (*func)(void *arg, int code), void *arg, int class, char *name, kthread_id_t t) { callb_t *cp; ASSERT3S(class, <, NCBCLASS); mutex_enter(&ct->ct_lock); while (ct->ct_busy) cv_wait(&ct->ct_busy_cv, &ct->ct_lock); if ((cp = ct->ct_freelist) == NULL) { ct->ct_ncallb++; cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); } ct->ct_freelist = cp->c_next; cp->c_thread = t; cp->c_func = func; cp->c_arg = arg; cp->c_class = (uchar_t)class; cp->c_flag |= CALLB_TAKEN; #ifdef ZFS_DEBUG if (strlen(name) > CB_MAXNAME) cmn_err(CE_WARN, "callb_add: name of callback function '%s' " "too long -- truncated to %d chars", name, CB_MAXNAME); #endif - (void) strncpy(cp->c_name, name, CB_MAXNAME); - cp->c_name[CB_MAXNAME] = '\0'; + (void) strlcpy(cp->c_name, name, sizeof (cp->c_name)); /* * Insert the new callb at the head of its class list. */ cp->c_next = ct->ct_first_cb[class]; ct->ct_first_cb[class] = cp; mutex_exit(&ct->ct_lock); return ((callb_id_t)cp); } /* * The default function to add an entry to the callback table. Since * it uses curthread as the thread identifier to store in the table, * it should be used for the normal case of a thread which is calling * to add ITSELF to the table. */ callb_id_t callb_add(boolean_t (*func)(void *arg, int code), void *arg, int class, char *name) { return (callb_add_common(func, arg, class, name, curthread)); } /* * A special version of callb_add() above for use by threads which * might be adding an entry to the table on behalf of some other * thread (for example, one which is constructed but not yet running). * In this version the thread id is an argument. 
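The strncpy()-to-strlcpy() conversion in callb_add_common() above (and the matching one in the taskq code) leans on the strlcpy() contract: the destination is always NUL-terminated when its size is non-zero, and the return value is the full source length, so truncation can be detected without a manual trailing '\0'. A quick user-space check of that behaviour; strlcpy() is declared in <string.h> on FreeBSD, while glibc-based systems may need libbsd or a local fallback:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    char name[8];
    const char *src = "a_rather_long_callback_name";

    size_t n = strlcpy(name, src, sizeof (name));
    if (n >= sizeof (name))         /* source did not fit */
        printf("truncated: kept \"%s\" of \"%s\"\n", name, src);
    return (0);
}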
*/ callb_id_t callb_add_thread(boolean_t (*func)(void *arg, int code), void *arg, int class, char *name, kthread_id_t t) { return (callb_add_common(func, arg, class, name, t)); } /* * callout_delete() is called to remove an entry identified by id * that was originally placed there by a call to callout_add(). * return -1 if fail to delete a callb entry otherwise return 0. */ int callb_delete(callb_id_t id) { callb_t **pp; callb_t *me = (callb_t *)id; mutex_enter(&ct->ct_lock); for (;;) { pp = &ct->ct_first_cb[me->c_class]; while (*pp != NULL && *pp != me) pp = &(*pp)->c_next; #ifdef ZFS_DEBUG if (*pp != me) { cmn_err(CE_WARN, "callb delete bogus entry 0x%p", (void *)me); mutex_exit(&ct->ct_lock); return (-1); } #endif /* DEBUG */ /* * It is not allowed to delete a callb in the middle of * executing otherwise, the callb_execute() will be confused. */ if (!(me->c_flag & CALLB_EXECUTING)) break; cv_wait(&me->c_done_cv, &ct->ct_lock); } /* relink the class list */ *pp = me->c_next; /* clean up myself and return the free callb to the head of freelist */ me->c_flag = CALLB_FREE; me->c_next = ct->ct_freelist; ct->ct_freelist = me; mutex_exit(&ct->ct_lock); return (0); } /* * class: indicates to execute all callbs in the same class; * code: optional argument for the callb functions. * return: = 0: success * != 0: ptr to string supplied when callback was registered */ void * callb_execute_class(int class, int code) { callb_t *cp; void *ret = NULL; ASSERT3S(class, <, NCBCLASS); mutex_enter(&ct->ct_lock); for (cp = ct->ct_first_cb[class]; cp != NULL && ret == 0; cp = cp->c_next) { while (cp->c_flag & CALLB_EXECUTING) cv_wait(&cp->c_done_cv, &ct->ct_lock); /* * cont if the callb is deleted while we're sleeping */ if (cp->c_flag == CALLB_FREE) continue; cp->c_flag |= CALLB_EXECUTING; #ifdef CALLB_DEBUG printf("callb_execute: name=%s func=%p arg=%p\n", cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); #endif /* CALLB_DEBUG */ mutex_exit(&ct->ct_lock); /* If callback function fails, pass back client's name */ if (!(*cp->c_func)(cp->c_arg, code)) ret = cp->c_name; mutex_enter(&ct->ct_lock); cp->c_flag &= ~CALLB_EXECUTING; cv_broadcast(&cp->c_done_cv); } mutex_exit(&ct->ct_lock); return (ret); } /* * callers make sure no recursive entries to this func. * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. * * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we * use a cv_timedwait() in case the kernel thread is blocked. * * Note that this is a generic callback handler for daemon CPR and * should NOT be changed to accommodate any specific requirement in a daemon. * Individual daemons that require changes to the handler shall write * callback routines in their own daemon modules. */ boolean_t callb_generic_cpr(void *arg, int code) { callb_cpr_t *cp = (callb_cpr_t *)arg; clock_t ret = 0; /* assume success */ mutex_enter(cp->cc_lockp); switch (code) { case CB_CODE_CPR_CHKPT: cp->cc_events |= CALLB_CPR_START; #ifdef CPR_NOT_THREAD_SAFE while (!(cp->cc_events & CALLB_CPR_SAFE)) /* cv_timedwait() returns -1 if it times out. */ if ((ret = cv_reltimedwait(&cp->cc_callb_cv, cp->cc_lockp, (callb_timeout_sec * hz), TR_CLOCK_TICK)) == -1) break; #endif break; case CB_CODE_CPR_RESUME: cp->cc_events &= ~CALLB_CPR_START; cv_signal(&cp->cc_stop_cv); break; } mutex_exit(cp->cc_lockp); return (ret != -1); } /* * The generic callback function associated with kernel threads which * are always considered safe. 
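callb_execute_class() above runs each callback with ct_lock dropped: it marks the entry CALLB_EXECUTING, releases the lock for the call, then re-takes the lock, clears the flag, and broadcasts c_done_cv so that callb_delete() can wait for an in-flight callback instead of freeing it out from under the caller. A reduced pthread sketch of that pattern, with hypothetical names:

#include <pthread.h>
#include <stdbool.h>

struct entry {
    bool executing;
    void (*func)(void *);
    void *arg;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;

static void
run_entry(struct entry *e)
{
    pthread_mutex_lock(&table_lock);
    while (e->executing)                    /* someone else is in it */
        pthread_cond_wait(&done_cv, &table_lock);
    e->executing = true;
    pthread_mutex_unlock(&table_lock);      /* never hold the lock in the callback */

    e->func(e->arg);

    pthread_mutex_lock(&table_lock);
    e->executing = false;
    pthread_cond_broadcast(&done_cv);       /* wake waiters, e.g. a pending delete */
    pthread_mutex_unlock(&table_lock);
}

static void
noop(void *arg)
{
    (void) arg;
}

int
main(void)
{
    struct entry e = { .executing = false, .func = noop, .arg = NULL };

    run_entry(&e);
    return (0);
}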
*/ boolean_t callb_generic_cpr_safe(void *arg, int code) { (void) arg, (void) code; return (B_TRUE); } /* * Prevent additions to callback table. */ void callb_lock_table(void) { mutex_enter(&ct->ct_lock); ASSERT(!ct->ct_busy); ct->ct_busy = B_TRUE; mutex_exit(&ct->ct_lock); } /* * Allow additions to callback table. */ void callb_unlock_table(void) { mutex_enter(&ct->ct_lock); ASSERT(ct->ct_busy); ct->ct_busy = B_FALSE; cv_broadcast(&ct->ct_busy_cv); mutex_exit(&ct->ct_lock); } SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL); SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index b290c36748ca..c65be4c134d5 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -1,2318 +1,2317 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #ifndef MNTK_VMSETSIZE_BUG #define MNTK_VMSETSIZE_BUG 0 #endif #ifndef MNTK_NOMSYNC #define MNTK_NOMSYNC 8 #endif struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owners can perform privileged operation on file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); #if __FreeBSD_version >= 1400018 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy); #else static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); #endif static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); #if __FreeBSD_version >= 1300098 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors); #else static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); #endif static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, #if __FreeBSD_version >= 1300049 .vfs_root = vfs_cache_root, .vfs_cachedroot = zfs_root, #else .vfs_root = zfs_root, #endif .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, .vfs_quotactl = zfs_quotactl, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. 
* This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, &zfvp); if (error != 0) return (error); if (zfvp == NULL) return (ENOENT); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) tmp = 1; break; case ZFS_PROP_DEVICES: if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) tmp = 1; break; case ZFS_PROP_EXEC: if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) tmp = 1; break; case ZFS_PROP_SETUID: if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) tmp = 1; break; case ZFS_PROP_READONLY: if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) tmp = 1; break; case ZFS_PROP_XATTR: if (zfvp->z_flags & ZSB_XATTR) tmp = zfvp->z_xattr; break; case ZFS_PROP_NBMAND: if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) tmp = 1; break; default: vfs_unbusy(vfsp); return (ENOENT); } vfs_unbusy(vfsp); if (tmp != *val) { (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } static int zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) { int error = 0; char buf[32]; uint64_t usedobj, quotaobj; uint64_t quota, used = 0; timespec_t now; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) { error = ENOENT; goto done; } (void) sprintf(buf, "%llx", (longlong_t)id); if ((error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota), 1, "a)) != 0) { dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); goto done; } /* * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". * So we set them to be the same. */ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); if (error && error != ENOENT) { dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); goto done; } dqp->dqb_curblocks = btodb(used); dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; vfs_timestamp(&now); /* * Setting this to 0 causes FreeBSD quota(8) to print * the number of days since the epoch, which isn't * particularly useful. 
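zfs_getquota() above addresses the quota ZAP by the id printed in hexadecimal and reports sizes to quota(8) in 512-byte blocks via btodb(). A stand-alone sketch of just that key and unit handling, with made-up values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEV_BSHIFT  9               /* 512-byte "disk blocks", like btodb() */

int
main(void)
{
    uint64_t id = 1001;             /* uid or gid being queried */
    uint64_t quota_bytes = 10ULL << 30;     /* pretend a 10 GiB quota */
    char key[32];

    (void) snprintf(key, sizeof (key), "%" PRIx64, id);
    printf("ZAP key \"%s\" -> hard/soft limit %" PRIu64 " blocks\n",
        key, quota_bytes >> DEV_BSHIFT);
    return (0);
}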
*/ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; done: return (error); } static int #if __FreeBSD_version >= 1400018 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy) #else zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; struct thread *td; int cmd, type, error = 0; int bitsize; zfs_userquota_prop_t quota_type; struct dqblk64 dqblk = { 0 }; td = curthread; cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (id == -1) { switch (type) { case USRQUOTA: id = td->td_ucred->cr_ruid; break; case GRPQUOTA: id = td->td_ucred->cr_rgid; break; default: error = EINVAL; #if __FreeBSD_version < 1400018 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(vfsp); #endif goto done; } } /* * Map BSD type to: * ZFS_PROP_USERUSED, * ZFS_PROP_USERQUOTA, * ZFS_PROP_GROUPUSED, * ZFS_PROP_GROUPQUOTA */ switch (cmd) { case Q_SETQUOTA: case Q_SETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERQUOTA; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPQUOTA; else error = EINVAL; break; case Q_GETQUOTA: case Q_GETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERUSED; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPUSED; else error = EINVAL; break; } /* * Depending on the cmd, we may need to get * the ruid and domain (see fuidstr_to_sid?), * the fuid (how?), or other information. * Create fuid using zfs_fuid_create(zfsvfs, id, * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? * I think I can use just the id? * * Look at zfs_id_overquota() to look up a quota. * zap_lookup(something, quotaobj, fuidstring, * sizeof (long long), 1, "a) * * See zfs_set_userquota() to set a quota. */ if ((uint32_t)type >= MAXQUOTAS) { error = EINVAL; goto done; } switch (cmd) { case Q_GETQUOTASIZE: bitsize = 64; error = copyout(&bitsize, arg, sizeof (int)); break; case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; #if __FreeBSD_version < 1400018 vfs_unbusy(vfsp); #endif break; case Q_QUOTAOFF: error = ENOTSUP; #if __FreeBSD_version < 1400018 vfs_unbusy(vfsp); #endif break; case Q_SETQUOTA: error = copyin(arg, &dqblk, sizeof (dqblk)); if (error == 0) error = zfs_set_userquota(zfsvfs, quota_type, "", id, dbtob(dqblk.dqb_bhardlimit)); break; case Q_GETQUOTA: error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); if (error == 0) error = copyout(&dqblk, arg, sizeof (dqblk)); break; default: error = EINVAL; break; } done: zfs_exit(zfsvfs, FTAG); return (error); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); } static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. 
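zfs_quotactl() above splits the packed command word into an operation (high bits, via SUBCMDSHIFT) and a quota type (low bits, via SUBCMDMASK), defaults id == -1 to the caller's own uid or gid, and then maps USRQUOTA/GRPQUOTA plus get/set onto the ZFS user and group quota properties. A sketch of just the decode step; the shift, mask, and opcode values below are stand-ins, not the system's real definitions:

#include <stdio.h>

#define OP_SHIFT        8           /* stand-in for SUBCMDSHIFT */
#define TYPE_MASK       0x00ff      /* stand-in for SUBCMDMASK */
#define Q_GETQUOTA_OP   0x0300      /* hypothetical opcode */
#define USRQUOTA_TYPE   0           /* hypothetical type */

int
main(void)
{
    int cmds = Q_GETQUOTA_OP | USRQUOTA_TYPE;
    int op = cmds >> OP_SHIFT;
    int type = cmds & TYPE_MASK;

    printf("op=%#x type=%d -> %s\n", op, type,
        type == USRQUOTA_TYPE ? "ZFS_PROP_USERUSED" : "ZFS_PROP_GROUPUSED");
    return (0);
}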
*/ if (rebooting && spa_suspended(dp->dp_spa)) { zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(8). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. 
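Each *_changed_cb() above keeps three views of a property in step when it changes: the cached field used on hot paths, the VFS flag, and the advertised mount-option string. A reduced sketch of that shape; the real callbacks use vfs_setmntopt()/vfs_clearmntopt() rather than a plain string:

#include <stdbool.h>
#include <stdio.h>

struct mntstate {
    bool atime;                     /* cached value used on the hot path */
    char opt[16];                   /* what mount(8) would report */
};

static void
atime_changed(struct mntstate *m, bool newval)
{
    m->atime = newval;
    (void) snprintf(m->opt, sizeof (m->opt), "%s",
        newval ? "atime" : "noatime");
}

int
main(void)
{
    struct mntstate m = { true, "atime" };

    atime_changed(&m, false);
    printf("atime=%d opt=%s\n", m.atime, m.opt);
    return (0);
}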
* We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static void acl_type_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_type = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; boolean_t xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; boolean_t do_xattr = B_FALSE; int error = 0; ASSERT3P(vfsp, !=, NULL); zfsvfs = vfsp->vfs_data; ASSERT3P(zfsvfs, !=, NULL); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. 
The problem is that spa_write_cachefile() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. 
Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); if (error != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); if (error == 0 && val == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT3U(zfsvfs->z_root, !=, 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. 
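zfsvfs_init() above repeats one idiom for every optional master-node entry: ENOENT from zap_lookup() means the feature was never used, so the object id is recorded as 0 and initialization continues, while any other error aborts. A sketch of that control flow with a made-up lookup function:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static int
lookup_optional(int (*lookup)(const char *, uint64_t *), const char *name,
    uint64_t *objp)
{
    int error = lookup(name, objp);

    if (error == ENOENT) {
        *objp = 0;                  /* absent is fine: nothing configured */
        return (0);
    }
    return (error);                 /* 0 on success, fatal otherwise */
}

static int
fake_lookup(const char *name, uint64_t *objp)
{
    (void) name, (void) objp;
    return (ENOENT);                /* pretend the entry does not exist */
}

int
main(void)
{
    uint64_t userquota_obj;

    if (lookup_optional(fake_lookup, "userquota", &userquota_obj) == 0)
        printf("userquota obj = %llu\n",
            (unsigned long long)userquota_obj);
    return (0);
}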
We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } taskq_t *zfsvfs_taskq; static void zfsvfs_task_unlinked_drain(void *context, int pending __unused) { zfs_unlinked_drain((zfsvfs_t *)context); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); return (error); } int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, zfsvfs_task_unlinked_drain, zfsvfs); ZFS_TEARDOWN_INIT(zfsvfs); ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) return (SET_ERROR(EROFS)); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); /* * During replay we remove the read only flag to * allow replays to succeed. 
*/ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) { zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; } else { dsl_dir_t *dd; zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", (u_longlong_t)zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { boolean_t use_nc = zfsvfs->z_use_namecache; zfsvfs->z_use_namecache = B_FALSE; zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; zfsvfs->z_use_namecache = use_nc; } } /* restore readonly bit */ if (readonly != 0) zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; } else { ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); } /* * Set the objset user_ptr to track its zfsvfs. 
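During mount, zfsvfs_setup() above saves the read-only flag, clears it so intent-log replay can write to the dataset, and restores it afterwards. The shape of that save/clear/restore, with hypothetical names:

#include <stdio.h>

#define FLAG_RDONLY 0x1

static void
replay_log(void)
{
    /* pretend to apply logged operations here */
}

static void
setup(unsigned *vfs_flag)
{
    unsigned readonly = *vfs_flag & FLAG_RDONLY;    /* save */

    if (readonly)
        *vfs_flag &= ~FLAG_RDONLY;                  /* allow replay writes */

    replay_log();

    if (readonly)
        *vfs_flag |= FLAG_RDONLY;                   /* restore */
}

int
main(void)
{
    unsigned flag = FLAG_RDONLY;

    setup(&flag);
    printf("readonly restored: %d\n", (flag & FLAG_RDONLY) != 0);
    return (0);
}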
*/ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); ASSERT3U(zfsvfs->z_nr_znodes, ==, 0); list_destroy(&zfsvfs->z_all_znodes); ZFS_TEARDOWN_DESTROY(zfsvfs); ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; ASSERT3P(vfsp, !=, NULL); ASSERT3P(osname, !=, NULL); error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; /* * This can cause a loss of coherence between ARC and page cache * on ZoF - unclear if the problem is in FreeBSD or ZoF */ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ vfsp->mnt_kern_flag |= MNTK_NOMSYNC; vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; #if defined(_KERNEL) && !defined(KMEM_DEBUG) vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; #endif /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | (vfsp->mnt_vfc->vfc_typenum & 0xFF); /* * Set features for file system. 
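zfs_domount() above packs the 56-bit objset id and the 8-bit filesystem type into the two 32-bit fsid words: the low 32 bits of the id in val[0], and bits 32..55 shifted up by 8 with the type in the low byte of val[1]. A stand-alone check of that arithmetic with made-up inputs:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t fsid_guid = 0x00abcdef01234567ULL;     /* 56-bit unique id */
    uint32_t vfc_typenum = 0x2a;                    /* 8-bit fs type */
    uint32_t val[2];

    assert((fsid_guid & ~((1ULL << 56) - 1)) == 0); /* really fits in 56 bits */
    val[0] = (uint32_t)fsid_guid;
    val[1] = (uint32_t)(((fsid_guid >> 32) << 8) | (vfc_typenum & 0xFF));

    printf("fsid = %08" PRIx32 ":%08" PRIx32 "\n", val[1], val[0]);
    return (0);
}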
*/ zfs_set_fuid_feature(zfsvfs); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acl_type_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); - (void) strncpy(poolname, osname, p - osname); - poolname[p - osname] = '\0'; + (void) strlcpy(poolname, osname, p - osname + 1); } return (0); } static void fetch_osname_options(char *name, bool *checkpointrewind) { if (name[0] == '!') { *checkpointrewind = true; memmove(name, name + 1, strlen(name)); } else { *checkpointrewind = false; } } static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; bool checkpointrewind; if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } fetch_osname_options(osname, &checkpointrewind); /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK1(mvp); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK1(mvp); goto out; } VOP_UNLOCK1(mvp); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curproc) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = SET_ERROR(EPERM); goto out; } vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. 
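The getpoolname() change above depends on strlcpy() taking the destination size, not a character count: passing (prefix length + 1) copies exactly the prefix and writes the terminating NUL, which is what the removed strncpy() plus explicit '\0' did by hand. A quick user-space check (strlcpy() is native on FreeBSD; other libcs may need libbsd):

#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *osname = "tank/home/user";
    const char *p = strchr(osname, '/');
    char poolname[64];

    (void) strlcpy(poolname, osname, (size_t)(p - osname) + 1);
    printf("pool = \"%s\"\n", poolname);    /* prints: pool = "tank" */
    return (0);
}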
* The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname, checkpointrewind); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; int error; statp->f_version = STATFS_VERSION; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ strlcpy(statp->f_fstypename, "zfs", sizeof (statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof (statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof (statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; zfs_exit(zfsvfs, FTAG); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); zfs_exit(zfsvfs, FTAG); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; dsl_dir_t *dd; /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. 
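The zfs_statfs() accounting above reports everything in SPA_MINBLOCKSIZE (512-byte) units: f_blocks from referenced plus available bytes, f_bfree from available bytes, and f_ffree capped at the number of free blocks since each object needs at least one block. The same arithmetic as a stand-alone calculation with made-up pool numbers:

#include <stdint.h>
#include <stdio.h>

#define MINBLOCKSHIFT   9           /* 512-byte blocks, like SPA_MINBLOCKSHIFT */

int
main(void)
{
    uint64_t refdbytes = 40ULL << 30;       /* 40 GiB referenced */
    uint64_t availbytes = 60ULL << 30;      /* 60 GiB available */
    uint64_t usedobjs = 1000000;
    uint64_t availobjs = 50000000;

    uint64_t bsize = 1ULL << MINBLOCKSHIFT;
    uint64_t blocks = (refdbytes + availbytes) >> MINBLOCKSHIFT;
    uint64_t bfree = availbytes / bsize;
    uint64_t ffree = availobjs < bfree ? availobjs : bfree;

    printf("blocks=%llu bfree=%llu files=%llu ffree=%llu\n",
        (unsigned long long)blocks, (unsigned long long)bfree,
        (unsigned long long)(ffree + usedobjs), (unsigned long long)ffree);
    return (0);
}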
* * If we're not unmounting there's no guarantee the list * will drain completely, but zreles run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * * We can safely read z_nr_znodes without locking because the * VFS has already blocked operations which add to the * z_all_znodes list and thus increment z_nr_znodes. */ int round = 0; while (zfsvfs->z_nr_znodes > 0) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ #ifdef FREEBSD_NAMECACHE #if __FreeBSD_version >= 1300117 cache_purgevfs(zfsvfs->z_parent->z_vfs); #else cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl != NULL) { zfs_znode_dmu_fini(zp); } } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (!zfs_is_readonly(zfsvfs)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)vfsp->vfs_resource, ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. 
*/ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfsvfs->z_unmounted = B_TRUE; ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) return (ret); while (taskqueue_cancel(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task, NULL) != 0) taskqueue_drain(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); if (err == 0) { err = vn_lock(*vpp, flags); if (err != 0) vrele(*vpp); } if (err != 0) *vpp = NULL; return (err); } static int #if __FreeBSD_version >= 1300098 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors) #else zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN, "struct fid bigger than SHORT_FID_LEN"); _Static_assert(sizeof (struct fid) >= LONG_FID_LEN, "struct fid bigger than LONG_FID_LEN"); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { struct componentname cn; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; vnode_t *dvp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t setgen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. 
*/ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); zfs_exit(zfsvfs, FTAG); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (fidp->fid_len == LONG_FID_LEN && (fid_gen > 1 || setgen != 0)) { dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", (u_longlong_t)fid_gen, (u_longlong_t)setgen); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { zfs_exit(zfsvfs, FTAG); VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | LOCKLEAF; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else if (object == zfsvfs->z_shares_dir) { /* * XXX This branch must not be taken, * if it is, then the lookup below will * explode. */ cn.cn_nameptr = "shares"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else { *vpp = dvp; } return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object, (u_longlong_t)fid_gen, (u_longlong_t)gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { zfs_exit(zfsvfs, FTAG); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); vrele(ZTOV(zp)); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); err = vn_lock(*vpp, flags); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. 
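zfs_fhtovp() above rebuilds 64-bit object and generation numbers from the little-endian byte arrays stored in the file handle, then masks the on-disk generation to the same width before comparing it with the handle's. A stand-alone sketch of the decode; the field sizes here are illustrative:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint8_t zf_object[6] = { 0x39, 0x30, 0x00, 0x00, 0x00, 0x00 };
    uint8_t zf_gen[4] = { 0x07, 0x00, 0x00, 0x00 };
    uint64_t object = 0, fid_gen = 0;
    unsigned i;

    for (i = 0; i < sizeof (zf_object); i++)
        object |= (uint64_t)zf_object[i] << (8 * i);
    for (i = 0; i < sizeof (zf_gen); i++)
        fid_gen |= (uint64_t)zf_gen[i] << (8 * i);

    /* mask covering exactly the width of the gen field */
    uint64_t gen_mask = -1ULL >> (64 - 8 * sizeof (zf_gen));

    printf("object=%llu gen=%llu mask=%#llx\n",
        (unsigned long long)object, (unsigned long long)fid_gen,
        (unsigned long long)gen_mask);
    return (0);
}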
Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; znode_t *zp; ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via zfs_enter_verify_zp * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #include #include #include #include #include #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof (struct vm_object) + sizeof (struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); } void zfs_fini(void) { taskq_destroy(zfsvfs_taskq); zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. 
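(Editor's note: the zfs_vnodes_adjust() logic above only trims desiredvnodes when the administrator has not tuned it. A small stand-alone sketch of that arithmetic follows; the numeric stand-ins for maxproc, v_page_count, vm_kmem_size and the structure sizes are made up for illustration.)

    #include <stdio.h>

    /* Hypothetical stand-ins for the kernel globals used by vntblinit(). */
    #define MAXPROC         1000
    #define V_PAGE_COUNT    (512UL * 1024)          /* 2 GiB of 4 KiB pages */
    #define VM_KMEM_SIZE    (400UL * 1024 * 1024)
    #define OBJ_VNODE_SZ    (256 + 512)     /* sizeof(vm_object) + sizeof(vnode) */

    static unsigned long
    min_ul(unsigned long a, unsigned long b)
    {
            return (a < b ? a : b);
    }

    int
    main(void)
    {
            /* Same formula as vntblinit() uses to pick the default. */
            unsigned long newdesired = min_ul(MAXPROC + V_PAGE_COUNT / 4,
                2 * VM_KMEM_SIZE / (5 * OBJ_VNODE_SZ));
            unsigned long desiredvnodes = newdesired;   /* assume untuned */

            /* If untuned (values match), trim by 25% for ZFS on i386. */
            if (newdesired == desiredvnodes)
                    desiredvnodes = (3 * newdesired) / 4;

            printf("desiredvnodes reduced to %lu\n", desiredvnodes);
            return (0);
    }
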
*/ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_vfs, 0); zfsvfs->z_unmounted = B_TRUE; return (0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY0(sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %ju to %ju", (uintmax_t)zfsvfs->z_version, (uintmax_t)newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; } else { pname = zfs_prop_to_name(prop); } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; case ZFS_PROP_ACLTYPE: *value = ZFS_ACLTYPE_NFSV4; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. 
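(Editor's note: zfs_get_zplprop() above is a read-through cache: a per-objset slot holds either an "uninitialized" sentinel or the last value read from the master node. A user-space sketch of the same pattern; the sentinel value and the backing lookup are illustrative, not the kernel ones.)

    #include <stdint.h>
    #include <stdio.h>

    #define PROP_UNINITIALIZED      ((uint64_t)-1)  /* stand-in sentinel */

    /* Pretend backing store; in the kernel this is the ZAP master node. */
    static int
    backing_lookup(const char *name, uint64_t *value)
    {
            (void) name;
            *value = 5;     /* e.g. a ZPL version number */
            return (0);
    }

    static int
    get_prop_cached(const char *name, uint64_t *cache_slot, uint64_t *value)
    {
            if (*cache_slot != PROP_UNINITIALIZED) {
                    *value = *cache_slot;   /* fast path: cached copy */
                    return (0);
            }
            int error = backing_lookup(name, value);
            if (error == 0)
                    *cache_slot = *value;   /* populate the cache on success */
            return (error);
    }

    int
    main(void)
    {
            uint64_t version_cache = PROP_UNINITIALIZED;
            uint64_t v;

            (void) get_prop_cached("version", &version_cache, &v); /* miss */
            (void) get_prop_cached("version", &version_cache, &v); /* hit */
            printf("version=%llu\n", (unsigned long long)v);
            return (0);
    }
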
*/ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_vfs != NULL && (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void) strlcpy(fromname, newname, sizeof (mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", newname, fromname + oldlen); (void) strlcpy(fromname, tmpbuf, sizeof (mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index efb8d0c30330..e355e2bfc3a0 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -1,1465 +1,1465 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #include #include #include #include #include #include #include #include #include #include /* * Within the scope of spl-kmem.c file the kmem_cache_* definitions * are removed to allow access to the real Linux slab allocator. */ #undef kmem_cache_destroy #undef kmem_cache_create #undef kmem_cache_alloc #undef kmem_cache_free /* * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() * with smp_mb__{before,after}_atomic() because they were redundant. This is * only used inside our SLAB allocator, so we implement an internal wrapper * here to give us smp_mb__{before,after}_atomic() on older kernels. */ #ifndef smp_mb__before_atomic #define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) #endif #ifndef smp_mb__after_atomic #define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) #endif /* BEGIN CSTYLED */ /* * Cache magazines are an optimization designed to minimize the cost of * allocating memory. 
They do this by keeping a per-cpu cache of recently * freed objects, which can then be reallocated without taking a lock. This * can improve performance on highly contended caches. However, because * objects in magazines will prevent otherwise empty slabs from being * immediately released this may not be ideal for low memory machines. * * For this reason spl_kmem_cache_magazine_size can be used to set a maximum * magazine size. When this value is set to 0 the magazine size will be * automatically determined based on the object size. Otherwise magazines * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines * may never be entirely disabled in this implementation. */ static unsigned int spl_kmem_cache_magazine_size = 0; module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); /* * The default behavior is to report the number of objects remaining in the * cache. This allows the Linux VM to repeatedly reclaim objects from the * cache when memory is low satisfy other memory allocations. Alternately, * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache * is reclaimed. This may increase the likelihood of out of memory events. */ static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; module_param(spl_kmem_cache_reclaim, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); static unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; module_param(spl_kmem_cache_max_size, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); /* * For small objects the Linux slab allocator should be used to make the most * efficient use of the memory. However, large objects are not supported by * the Linux slab and therefore the SPL implementation is preferred. A cutoff * of 16K was determined to be optimal for architectures using 4K pages and * to also work well on architecutres using larger 64K page sizes. */ static unsigned int spl_kmem_cache_slab_limit = 16384; module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); /* * The number of threads available to allocate new slabs for caches. This * should not need to be tuned but it is available for performance analysis. */ static unsigned int spl_kmem_cache_kmem_threads = 4; module_param(spl_kmem_cache_kmem_threads, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, "Number of spl_kmem_cache threads"); /* END CSTYLED */ /* * Slab allocation interfaces * * While the Linux slab implementation was inspired by the Solaris * implementation I cannot use it to emulate the Solaris APIs. I * require two features which are not provided by the Linux slab. * * 1) Constructors AND destructors. Recent versions of the Linux * kernel have removed support for destructors. This is a deal * breaker for the SPL which contains particularly expensive * initializers for mutex's, condition variables, etc. We also * require a minimal level of cleanup for these data types unlike * many Linux data types which do need to be explicitly destroyed. * * 2) Virtual address space backed slab. 
Callers of the Solaris slab * expect it to work well for both small are very large allocations. * Because of memory fragmentation the Linux slab which is backed * by kmalloc'ed memory performs very badly when confronted with * large numbers of large allocations. Basing the slab on the * virtual address space removes the need for contiguous pages * and greatly improve performance for large allocations. * * For these reasons, the SPL has its own slab implementation with * the needed features. It is not as highly optimized as either the * Solaris or Linux slabs, but it should get me most of what is * needed until it can be optimized or obsoleted by another approach. * * One serious concern I do have about this method is the relatively * small virtual address space on 32bit arches. This will seriously * constrain the size of the slab caches and their performance. */ struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { gfp_t lflags = kmem_flags_convert(flags); void *ptr; ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); return (ptr); } static void kv_free(spl_kmem_cache_t *skc, void *ptr, int size) { ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); /* * The Linux direct reclaim path uses this out of band value to * determine if forward progress is being made. Normally this is * incremented by kmem_freepages() which is part of the various * Linux slab implementations. However, since we are using none * of that infrastructure we are responsible for incrementing it. */ if (current->reclaim_state) current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; vfree(ptr); } /* * Required space for each aligned sks. */ static inline uint32_t spl_sks_size(spl_kmem_cache_t *skc) { return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), skc->skc_obj_align, uint32_t)); } /* * Required space for each aligned object. */ static inline uint32_t spl_obj_size(spl_kmem_cache_t *skc) { uint32_t align = skc->skc_obj_align; return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); } uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache) { return (cache->skc_obj_total); } EXPORT_SYMBOL(spl_kmem_cache_inuse); uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache) { return (cache->skc_obj_size); } EXPORT_SYMBOL(spl_kmem_cache_entry_size); /* * Lookup the spl_kmem_object_t for an object given that object. */ static inline spl_kmem_obj_t * spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) { return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, skc->skc_obj_align, uint32_t)); } /* * It's important that we pack the spl_kmem_obj_t structure and the * actual objects in to one large address space to minimize the number * of calls to the allocator. It is far better to do a few large * allocations and then subdivide it ourselves. Now which allocator * we use requires balancing a few trade offs. * * For small objects we use kmem_alloc() because as long as you are * only requesting a small number of pages (ideally just one) its cheap. * However, when you start requesting multiple pages with kmem_alloc() * it gets increasingly expensive since it requires contiguous pages. 
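(Editor's note: the spl_sks_size()/spl_obj_size()/spl_sko_from_obj() helpers above are plain power-of-two round-up arithmetic: each object is padded to the cache alignment and immediately followed by its spl_kmem_obj_t header. A stand-alone sketch of that layout; the structure sizes and the base address are made up, and P2ROUNDUP is written out in its conventional form.)

    #include <stdint.h>
    #include <stdio.h>

    /* Round x up to the next multiple of align (align must be a power of two). */
    #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

    int
    main(void)
    {
            uint32_t obj_size = 1000;   /* hypothetical skc_obj_size */
            uint32_t align = 32;        /* hypothetical skc_obj_align */
            uint32_t sko_size = 48;     /* hypothetical sizeof (spl_kmem_obj_t) */
            uint32_t sks_size = 128;    /* hypothetical aligned slab header size */

            /* Space per object: padded object plus padded object header. */
            uint32_t per_obj = P2ROUNDUP(obj_size, align) +
                P2ROUNDUP(sko_size, align);

            uintptr_t base = 0x10000;   /* pretend slab base address */
            for (int i = 0; i < 3; i++) {
                    /* Object i sits after the slab header, headers trail objects. */
                    uintptr_t obj = base + P2ROUNDUP(sks_size, align) +
                        (uintptr_t)i * per_obj;
                    uintptr_t sko = obj + P2ROUNDUP(obj_size, align);
                    printf("obj %d at 0x%lx, header at 0x%lx\n",
                        i, (unsigned long)obj, (unsigned long)sko);
            }
            return (0);
    }
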
* For this reason we shift to vmem_alloc() for slabs of large objects * which removes the need for contiguous pages. We do not use * vmem_alloc() in all cases because there is significant locking * overhead in __get_vm_area_node(). This function takes a single * global lock when acquiring an available virtual address range which * serializes all vmem_alloc()'s for all slab caches. Using slightly * different allocation functions for small and large objects should * give us the best of both worlds. * * +------------------------+ * | spl_kmem_slab_t --+-+ | * | skc_obj_size <-+ | | * | spl_kmem_obj_t | | * | skc_obj_size <---+ | * | spl_kmem_obj_t | | * | ... v | * +------------------------+ */ static spl_kmem_slab_t * spl_slab_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; void *base; uint32_t obj_size; base = kv_alloc(skc, skc->skc_slab_size, flags); if (base == NULL) return (NULL); sks = (spl_kmem_slab_t *)base; sks->sks_magic = SKS_MAGIC; sks->sks_objs = skc->skc_slab_objs; sks->sks_age = jiffies; sks->sks_cache = skc; INIT_LIST_HEAD(&sks->sks_list); INIT_LIST_HEAD(&sks->sks_free_list); sks->sks_ref = 0; obj_size = spl_obj_size(skc); for (int i = 0; i < sks->sks_objs; i++) { void *obj = base + spl_sks_size(skc) + (i * obj_size); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj); sko->sko_addr = obj; sko->sko_magic = SKO_MAGIC; sko->sko_slab = sks; INIT_LIST_HEAD(&sko->sko_list); list_add_tail(&sko->sko_list, &sks->sks_free_list); } return (sks); } /* * Remove a slab from complete or partial list, it must be called with * the 'skc->skc_lock' held but the actual free must be performed * outside the lock to prevent deadlocking on vmem addresses. */ static void spl_slab_free(spl_kmem_slab_t *sks, struct list_head *sks_list, struct list_head *sko_list) { spl_kmem_cache_t *skc; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref == 0); skc = sks->sks_cache; ASSERT(skc->skc_magic == SKC_MAGIC); /* * Update slab/objects counters in the cache, then remove the * slab from the skc->skc_partial_list. Finally add the slab * and all its objects in to the private work lists where the * destructors will be called and the memory freed to the system. */ skc->skc_obj_total -= sks->sks_objs; skc->skc_slab_total--; list_del(&sks->sks_list); list_add(&sks->sks_list, sks_list); list_splice_init(&sks->sks_free_list, sko_list); } /* * Reclaim empty slabs at the end of the partial list. */ static void spl_slab_reclaim(spl_kmem_cache_t *skc) { spl_kmem_slab_t *sks = NULL, *m = NULL; spl_kmem_obj_t *sko = NULL, *n = NULL; LIST_HEAD(sks_list); LIST_HEAD(sko_list); /* * Empty slabs and objects must be moved to a private list so they * can be safely freed outside the spin lock. All empty slabs are * at the end of skc->skc_partial_list, therefore once a non-empty * slab is found we can stop scanning. */ spin_lock(&skc->skc_lock); list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) { if (sks->sks_ref > 0) break; spl_slab_free(sks, &sks_list, &sko_list); } spin_unlock(&skc->skc_lock); /* * The following two loops ensure all the object destructors are run, * and the slabs themselves are freed. This is all done outside the * skc->skc_lock since this allows the destructor to sleep, and * allows us to perform a conditional reschedule when a freeing a * large number of objects and slabs back to the system. 
*/ list_for_each_entry_safe(sko, n, &sko_list, sko_list) { ASSERT(sko->sko_magic == SKO_MAGIC); } list_for_each_entry_safe(sks, m, &sks_list, sks_list) { ASSERT(sks->sks_magic == SKS_MAGIC); kv_free(skc, sks, skc->skc_slab_size); } } static spl_kmem_emergency_t * spl_emergency_search(struct rb_root *root, void *obj) { struct rb_node *node = root->rb_node; spl_kmem_emergency_t *ske; unsigned long address = (unsigned long)obj; while (node) { ske = container_of(node, spl_kmem_emergency_t, ske_node); if (address < ske->ske_obj) node = node->rb_left; else if (address > ske->ske_obj) node = node->rb_right; else return (ske); } return (NULL); } static int spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) { struct rb_node **new = &(root->rb_node), *parent = NULL; spl_kmem_emergency_t *ske_tmp; unsigned long address = ske->ske_obj; while (*new) { ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); parent = *new; if (address < ske_tmp->ske_obj) new = &((*new)->rb_left); else if (address > ske_tmp->ske_obj) new = &((*new)->rb_right); else return (0); } rb_link_node(&ske->ske_node, parent, new); rb_insert_color(&ske->ske_node, root); return (1); } /* * Allocate a single emergency object and track it in a red black tree. */ static int spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) { gfp_t lflags = kmem_flags_convert(flags); spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); int empty; /* Last chance use a partial slab if one now exists */ spin_lock(&skc->skc_lock); empty = list_empty(&skc->skc_partial_list); spin_unlock(&skc->skc_lock); if (!empty) return (-EEXIST); ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); ske->ske_obj = __get_free_pages(lflags, order); if (ske->ske_obj == 0) { kfree(ske); return (-ENOMEM); } spin_lock(&skc->skc_lock); empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); if (likely(empty)) { skc->skc_obj_total++; skc->skc_obj_emergency++; if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) skc->skc_obj_emergency_max = skc->skc_obj_emergency; } spin_unlock(&skc->skc_lock); if (unlikely(!empty)) { free_pages(ske->ske_obj, order); kfree(ske); return (-EINVAL); } *obj = (void *)ske->ske_obj; return (0); } /* * Locate the passed object in the red black tree and free it. */ static int spl_emergency_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); spin_lock(&skc->skc_lock); ske = spl_emergency_search(&skc->skc_emergency_tree, obj); if (ske) { rb_erase(&ske->ske_node, &skc->skc_emergency_tree); skc->skc_obj_emergency--; skc->skc_obj_total--; } spin_unlock(&skc->skc_lock); if (ske == NULL) return (-ENOENT); free_pages(ske->ske_obj, order); kfree(ske); return (0); } /* * Release objects from the per-cpu magazine back to their slab. The flush * argument contains the max number of entries to remove from the magazine. */ static void spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) { spin_lock(&skc->skc_lock); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); int count = MIN(flush, skm->skm_avail); for (int i = 0; i < count; i++) spl_cache_shrink(skc, skm->skm_objs[i]); skm->skm_avail -= count; memmove(skm->skm_objs, &(skm->skm_objs[count]), sizeof (void *) * skm->skm_avail); spin_unlock(&skc->skc_lock); } /* * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. * When on-slab we want to target spl_kmem_cache_obj_per_slab. 
However, * for very small objects we may end up with more than this so as not * to waste space in the minimal allocation of a single page. */ static int spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) { uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; sks_size = spl_sks_size(skc); obj_size = spl_obj_size(skc); max_size = (spl_kmem_cache_max_size * 1024 * 1024); tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); if (tgt_size <= max_size) { tgt_objs = (tgt_size - sks_size) / obj_size; } else { tgt_objs = (max_size - sks_size) / obj_size; tgt_size = (tgt_objs * obj_size) + sks_size; } if (tgt_objs == 0) return (-ENOSPC); *objs = tgt_objs; *size = tgt_size; return (0); } /* * Make a guess at reasonable per-cpu magazine size based on the size of * each object and the cost of caching N of them in each magazine. Long * term this should really adapt based on an observed usage heuristic. */ static int spl_magazine_size(spl_kmem_cache_t *skc) { uint32_t obj_size = spl_obj_size(skc); int size; if (spl_kmem_cache_magazine_size > 0) return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); /* Per-magazine sizes below assume a 4Kib page size */ if (obj_size > (PAGE_SIZE * 256)) size = 4; /* Minimum 4Mib per-magazine */ else if (obj_size > (PAGE_SIZE * 32)) size = 16; /* Minimum 2Mib per-magazine */ else if (obj_size > (PAGE_SIZE)) size = 64; /* Minimum 256Kib per-magazine */ else if (obj_size > (PAGE_SIZE / 4)) size = 128; /* Minimum 128Kib per-magazine */ else size = 256; return (size); } /* * Allocate a per-cpu magazine to associate with a specific core. */ static spl_kmem_magazine_t * spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) { spl_kmem_magazine_t *skm; int size = sizeof (spl_kmem_magazine_t) + sizeof (void *) * skc->skc_mag_size; skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (skm) { skm->skm_magic = SKM_MAGIC; skm->skm_avail = 0; skm->skm_size = skc->skc_mag_size; skm->skm_refill = skc->skc_mag_refill; skm->skm_cache = skc; skm->skm_cpu = cpu; } return (skm); } /* * Free a per-cpu magazine associated with a specific core. */ static void spl_magazine_free(spl_kmem_magazine_t *skm) { ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); kfree(skm); } /* * Create all pre-cpu magazines of reasonable sizes. */ static int spl_magazine_create(spl_kmem_cache_t *skc) { int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); skc->skc_mag_size = spl_magazine_size(skc); skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; for_each_possible_cpu(i) { skc->skc_mag[i] = spl_magazine_alloc(skc, i); if (!skc->skc_mag[i]) { for (i--; i >= 0; i--) spl_magazine_free(skc->skc_mag[i]); kfree(skc->skc_mag); return (-ENOMEM); } } return (0); } /* * Destroy all pre-cpu magazines. 
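(Editor's note: a worked example of the spl_slab_size() calculation above, using made-up header/object sizes and the default-style tunables; the numbers are illustrative only.)

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t obj_per_slab = 8;              /* target objects per slab */
            uint32_t max_size = 32 * 1024 * 1024;   /* max slab size in bytes */
            uint32_t sks_size = 128;                /* made-up aligned header */
            uint32_t obj_size = 128 * 1024;         /* made-up aligned object */

            uint32_t tgt_size = obj_per_slab * obj_size + sks_size;
            uint32_t tgt_objs;

            if (tgt_size <= max_size) {
                    /* The whole target fits; back out the object count. */
                    tgt_objs = (tgt_size - sks_size) / obj_size;
            } else {
                    /* Clamp to max_size and recompute the smaller slab. */
                    tgt_objs = (max_size - sks_size) / obj_size;
                    tgt_size = tgt_objs * obj_size + sks_size;
            }

            printf("slab: %u objects, %u bytes\n", tgt_objs, tgt_size);
            return (0);
    }
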
*/ static void spl_magazine_destroy(spl_kmem_cache_t *skc) { spl_kmem_magazine_t *skm; int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); for_each_possible_cpu(i) { skm = skc->skc_mag[i]; spl_cache_flush(skc, skm, skm->skm_avail); spl_magazine_free(skm); } kfree(skc->skc_mag); } /* * Create a object cache based on the following arguments: * name cache name * size cache object size * align cache object alignment * ctor cache object constructor * dtor cache object destructor * reclaim cache object reclaim * priv cache private data for ctor/dtor/reclaim * vmp unused must be NULL * flags * KMC_KVMEM Force kvmem backed SPL cache * KMC_SLAB Force Linux slab backed cache * KMC_NODEBUG Disable debugging (unsupported) */ spl_kmem_cache_t * spl_kmem_cache_create(const char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags) { gfp_t lflags = kmem_flags_convert(KM_SLEEP); spl_kmem_cache_t *skc; int rc; /* * Unsupported flags */ ASSERT(vmp == NULL); ASSERT(reclaim == NULL); might_sleep(); skc = kzalloc(sizeof (*skc), lflags); if (skc == NULL) return (NULL); skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { kfree(skc); return (NULL); } - strncpy(skc->skc_name, name, skc->skc_name_size); + strlcpy(skc->skc_name, name, skc->skc_name_size); skc->skc_ctor = ctor; skc->skc_dtor = dtor; skc->skc_private = priv; skc->skc_vmp = vmp; skc->skc_linux_cache = NULL; skc->skc_flags = flags; skc->skc_obj_size = size; skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; atomic_set(&skc->skc_ref, 0); INIT_LIST_HEAD(&skc->skc_list); INIT_LIST_HEAD(&skc->skc_complete_list); INIT_LIST_HEAD(&skc->skc_partial_list); skc->skc_emergency_tree = RB_ROOT; spin_lock_init(&skc->skc_lock); init_waitqueue_head(&skc->skc_waitq); skc->skc_slab_fail = 0; skc->skc_slab_create = 0; skc->skc_slab_destroy = 0; skc->skc_slab_total = 0; skc->skc_slab_alloc = 0; skc->skc_slab_max = 0; skc->skc_obj_total = 0; skc->skc_obj_alloc = 0; skc->skc_obj_max = 0; skc->skc_obj_deadlock = 0; skc->skc_obj_emergency = 0; skc->skc_obj_emergency_max = 0; rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0, GFP_KERNEL); if (rc != 0) { kfree(skc); return (NULL); } /* * Verify the requested alignment restriction is sane. */ if (align) { VERIFY(ISP2(align)); VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); VERIFY3U(align, <=, PAGE_SIZE); skc->skc_obj_align = align; } /* * When no specific type of slab is requested (kmem, vmem, or * linuxslab) then select a cache type based on the object size * and default tunables. */ if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) { if (spl_kmem_cache_slab_limit && size <= (size_t)spl_kmem_cache_slab_limit) { /* * Objects smaller than spl_kmem_cache_slab_limit can * use the Linux slab for better space-efficiency. */ skc->skc_flags |= KMC_SLAB; } else { /* * All other objects are considered large and are * placed on kvmem backed slabs. */ skc->skc_flags |= KMC_KVMEM; } } /* * Given the type of slab allocate the required resources. */ if (skc->skc_flags & KMC_KVMEM) { rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size); if (rc) goto out; rc = spl_magazine_create(skc); if (rc) goto out; } else { unsigned long slabflags = 0; if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { rc = EINVAL; goto out; } #if defined(SLAB_USERCOPY) /* * Required for PAX-enabled kernels if the slab is to be * used for copying between user and kernel space. 
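(Editor's note: a minimal sketch of how a caller might use the cache API built in this file, end to end. The object type, function names prefixed my_, and header paths are hypothetical; ctor/dtor/reclaim/priv/vmp are left NULL as permitted above, and flags == 0 lets spl_kmem_cache_create() pick KMC_SLAB or KMC_KVMEM from the object size.)

    #include <sys/types.h>
    #include <sys/kmem.h>           /* KM_SLEEP (assumed header location) */
    #include <sys/kmem_cache.h>

    /* Hypothetical object type cached by a consumer module. */
    typedef struct my_node {
            uint64_t mn_key;
            uint64_t mn_value;
    } my_node_t;

    static spl_kmem_cache_t *my_node_cache;

    void
    my_module_init(void)
    {
            /* No constructor, destructor, or private data for this sketch. */
            my_node_cache = spl_kmem_cache_create("my_node_cache",
                sizeof (my_node_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    }

    void
    my_module_use(void)
    {
            /* KM_SLEEP callers are guaranteed not to fail. */
            my_node_t *n = spl_kmem_cache_alloc(my_node_cache, KM_SLEEP);

            n->mn_key = 1;
            n->mn_value = 2;
            spl_kmem_cache_free(my_node_cache, n);
    }

    void
    my_module_fini(void)
    {
            spl_kmem_cache_destroy(my_node_cache);
    }
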
*/ slabflags |= SLAB_USERCOPY; #endif #if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY) /* * Newer grsec patchset uses kmem_cache_create_usercopy() * instead of SLAB_USERCOPY flag */ skc->skc_linux_cache = kmem_cache_create_usercopy( skc->skc_name, size, align, slabflags, 0, size, NULL); #else skc->skc_linux_cache = kmem_cache_create( skc->skc_name, size, align, slabflags, NULL); #endif if (skc->skc_linux_cache == NULL) { rc = ENOMEM; goto out; } } down_write(&spl_kmem_cache_sem); list_add_tail(&skc->skc_list, &spl_kmem_cache_list); up_write(&spl_kmem_cache_sem); return (skc); out: kfree(skc->skc_name); percpu_counter_destroy(&skc->skc_linux_alloc); kfree(skc); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); /* * Register a move callback for cache defragmentation. * XXX: Unimplemented but harmless to stub out for now. */ void spl_kmem_cache_set_move(spl_kmem_cache_t *skc, kmem_cbrc_t (move)(void *, void *, size_t, void *)) { ASSERT(move != NULL); } EXPORT_SYMBOL(spl_kmem_cache_set_move); /* * Destroy a cache and all objects associated with the cache. */ void spl_kmem_cache_destroy(spl_kmem_cache_t *skc) { DECLARE_WAIT_QUEUE_HEAD(wq); taskqid_t id; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB)); down_write(&spl_kmem_cache_sem); list_del_init(&skc->skc_list); up_write(&spl_kmem_cache_sem); /* Cancel any and wait for any pending delayed tasks */ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); spin_lock(&skc->skc_lock); id = skc->skc_taskqid; spin_unlock(&skc->skc_lock); taskq_cancel_id(spl_kmem_cache_taskq, id); /* * Wait until all current callers complete, this is mainly * to catch the case where a low memory situation triggers a * cache reaping action which races with this destroy. */ wait_event(wq, atomic_read(&skc->skc_ref) == 0); if (skc->skc_flags & KMC_KVMEM) { spl_magazine_destroy(skc); spl_slab_reclaim(skc); } else { ASSERT(skc->skc_flags & KMC_SLAB); kmem_cache_destroy(skc->skc_linux_cache); } spin_lock(&skc->skc_lock); /* * Validate there are no objects in use and free all the * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ ASSERT3U(skc->skc_slab_alloc, ==, 0); ASSERT3U(skc->skc_obj_alloc, ==, 0); ASSERT3U(skc->skc_slab_total, ==, 0); ASSERT3U(skc->skc_obj_total, ==, 0); ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); percpu_counter_destroy(&skc->skc_linux_alloc); spin_unlock(&skc->skc_lock); kfree(skc->skc_name); kfree(skc); } EXPORT_SYMBOL(spl_kmem_cache_destroy); /* * Allocate an object from a slab attached to the cache. This is used to * repopulate the per-cpu magazine caches in batches when they run low. */ static void * spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) { spl_kmem_obj_t *sko; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(sks->sks_magic == SKS_MAGIC); sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); ASSERT(sko->sko_magic == SKO_MAGIC); ASSERT(sko->sko_addr != NULL); /* Remove from sks_free_list */ list_del_init(&sko->sko_list); sks->sks_age = jiffies; sks->sks_ref++; skc->skc_obj_alloc++; /* Track max obj usage statistics */ if (skc->skc_obj_alloc > skc->skc_obj_max) skc->skc_obj_max = skc->skc_obj_alloc; /* Track max slab usage statistics */ if (sks->sks_ref == 1) { skc->skc_slab_alloc++; if (skc->skc_slab_alloc > skc->skc_slab_max) skc->skc_slab_max = skc->skc_slab_alloc; } return (sko->sko_addr); } /* * Generic slab allocation function to run by the global work queues. 
* It is responsible for allocating a new slab, linking it in to the list * of partial slabs, and then waking any waiters. */ static int __spl_cache_grow(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; fstrans_cookie_t cookie = spl_fstrans_mark(); sks = spl_slab_alloc(skc, flags); spl_fstrans_unmark(cookie); spin_lock(&skc->skc_lock); if (sks) { skc->skc_slab_total++; skc->skc_obj_total += sks->sks_objs; list_add_tail(&sks->sks_list, &skc->skc_partial_list); smp_mb__before_atomic(); clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); smp_mb__after_atomic(); } spin_unlock(&skc->skc_lock); return (sks == NULL ? -ENOMEM : 0); } static void spl_cache_grow_work(void *data) { spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; spl_kmem_cache_t *skc = ska->ska_cache; int error = __spl_cache_grow(skc, ska->ska_flags); atomic_dec(&skc->skc_ref); smp_mb__before_atomic(); clear_bit(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); if (error == 0) wake_up_all(&skc->skc_waitq); kfree(ska); } /* * Returns non-zero when a new slab should be available. */ static int spl_cache_grow_wait(spl_kmem_cache_t *skc) { return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); } /* * No available objects on any slabs, create a new slab. Note that this * functionality is disabled for KMC_SLAB caches which are backed by the * Linux slab. */ static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) { int remaining, rc = 0; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); might_sleep(); *obj = NULL; /* * Before allocating a new slab wait for any reaping to complete and * then return so the local magazine can be rechecked for new objects. */ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, TASK_UNINTERRUPTIBLE); return (rc ? rc : -EAGAIN); } /* * Note: It would be nice to reduce the overhead of context switch * and improve NUMA locality, by trying to allocate a new slab in the * current process context with KM_NOSLEEP flag. * * However, this can't be applied to vmem/kvmem due to a bug that * spl_vmalloc() doesn't honor gfp flags in page table allocation. */ /* * This is handled by dispatching a work request to the global work * queue. This allows us to asynchronously allocate a new slab while * retaining the ability to safely fall back to a smaller synchronous * allocations to ensure forward progress is always maintained. */ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_all(&skc->skc_waitq); return (-ENOMEM); } atomic_inc(&skc->skc_ref); ska->ska_cache = skc; ska->ska_flags = flags; taskq_init_ent(&ska->ska_tqe); taskq_dispatch_ent(spl_kmem_cache_taskq, spl_cache_grow_work, ska, 0, &ska->ska_tqe); } /* * The goal here is to only detect the rare case where a virtual slab * allocation has deadlocked. We must be careful to minimize the use * of emergency objects which are more expensive to track. Therefore, * we set a very long timeout for the asynchronous allocation and if * the timeout is reached the cache is flagged as deadlocked. From * this point only new emergency objects will be allocated until the * asynchronous allocation completes and clears the deadlocked flag. 
*/ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { rc = spl_emergency_alloc(skc, flags, obj); } else { remaining = wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10); if (!remaining) { spin_lock(&skc->skc_lock); if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); skc->skc_obj_deadlock++; } spin_unlock(&skc->skc_lock); } rc = -ENOMEM; } return (rc); } /* * Refill a per-cpu magazine with objects from the slabs for this cache. * Ideally the magazine can be repopulated using existing objects which have * been released, however if we are unable to locate enough free objects new * slabs of objects will be created. On success NULL is returned, otherwise * the address of a single emergency object is returned for use by the caller. */ static void * spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) { spl_kmem_slab_t *sks; int count = 0, rc, refill; void *obj = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); while (refill > 0) { /* No slabs available we may need to grow the cache */ if (list_empty(&skc->skc_partial_list)) { spin_unlock(&skc->skc_lock); local_irq_enable(); rc = spl_cache_grow(skc, flags, &obj); local_irq_disable(); /* Emergency object for immediate use by caller */ if (rc == 0 && obj != NULL) return (obj); if (rc) goto out; /* Rescheduled to different CPU skm is not local */ if (skm != skc->skc_mag[smp_processor_id()]) goto out; /* * Potentially rescheduled to the same CPU but * allocations may have occurred from this CPU while * we were sleeping so recalculate max refill. */ refill = MIN(refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); continue; } /* Grab the next available slab */ sks = list_entry((&skc->skc_partial_list)->next, spl_kmem_slab_t, sks_list); ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref < sks->sks_objs); ASSERT(!list_empty(&sks->sks_free_list)); /* * Consume as many objects as needed to refill the requested * cache. We must also be careful not to overfill it. */ while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { ASSERT(skm->skm_avail < skm->skm_size); ASSERT(count < skm->skm_size); skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks); } /* Move slab to skc_complete_list when full */ if (sks->sks_ref == sks->sks_objs) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_complete_list); } } spin_unlock(&skc->skc_lock); out: return (NULL); } /* * Release an object back to the slab from which it came. */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) { spl_kmem_slab_t *sks = NULL; spl_kmem_obj_t *sko = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); sko = spl_sko_from_obj(skc, obj); ASSERT(sko->sko_magic == SKO_MAGIC); sks = sko->sko_slab; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_cache == skc); list_add(&sko->sko_list, &sks->sks_free_list); sks->sks_age = jiffies; sks->sks_ref--; skc->skc_obj_alloc--; /* * Move slab to skc_partial_list when no longer full. Slabs * are added to the head to keep the partial list is quasi-full * sorted order. Fuller at the head, emptier at the tail. */ if (sks->sks_ref == (sks->sks_objs - 1)) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_partial_list); } /* * Move empty slabs to the end of the partial list so * they can be easily found and freed during reclamation. 
*/ if (sks->sks_ref == 0) { list_del(&sks->sks_list); list_add_tail(&sks->sks_list, &skc->skc_partial_list); skc->skc_slab_alloc--; } } /* * Allocate an object from the per-cpu magazine, or if the magazine * is empty directly allocate from a slab and repopulate the magazine. */ void * spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_magazine_t *skm; void *obj = NULL; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Allocate directly from a Linux slab. All optimizations are left * to the underlying cache we only need to guarantee that KM_SLEEP * callers will never fail. */ if (skc->skc_flags & KMC_SLAB) { struct kmem_cache *slc = skc->skc_linux_cache; do { obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); if (obj != NULL) { /* * Even though we leave everything up to the * underlying cache we still keep track of * how many objects we've allocated in it for * better debuggability. */ percpu_counter_inc(&skc->skc_linux_alloc); } goto ret; } local_irq_disable(); restart: /* * Safe to update per-cpu structure without lock, but * in the restart case we must be careful to reacquire * the local magazine since this may have changed * when we need to grow the cache. */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); if (likely(skm->skm_avail)) { /* Object available in CPU cache, use it */ obj = skm->skm_objs[--skm->skm_avail]; } else { obj = spl_cache_refill(skc, skm, flags); if ((obj == NULL) && !(flags & KM_NOSLEEP)) goto restart; local_irq_enable(); goto ret; } local_irq_enable(); ASSERT(obj); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); ret: /* Pre-emptively migrate object to CPU L1 cache */ if (obj) { if (obj && skc->skc_ctor) skc->skc_ctor(obj, skc->skc_private, flags); else prefetchw(obj); } return (obj); } EXPORT_SYMBOL(spl_kmem_cache_alloc); /* * Free an object back to the local per-cpu magazine, there is no * guarantee that this is the same magazine the object was originally * allocated from. We may need to flush entire from the magazine * back to the slabs to make space. */ void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_magazine_t *skm; unsigned long flags; int do_reclaim = 0; int do_emergency = 0; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Run the destructor */ if (skc->skc_dtor) skc->skc_dtor(obj, skc->skc_private); /* * Free the object from the Linux underlying Linux slab. */ if (skc->skc_flags & KMC_SLAB) { kmem_cache_free(skc->skc_linux_cache, obj); percpu_counter_dec(&skc->skc_linux_alloc); return; } /* * While a cache has outstanding emergency objects all freed objects * must be checked. However, since emergency objects will never use * a virtual address these objects can be safely excluded as an * optimization. */ if (!is_vmalloc_addr(obj)) { spin_lock(&skc->skc_lock); do_emergency = (skc->skc_obj_emergency > 0); spin_unlock(&skc->skc_lock); if (do_emergency && (spl_emergency_free(skc, obj) == 0)) return; } local_irq_save(flags); /* * Safe to update per-cpu structure without lock, but * no remote memory allocation tracking is being performed * it is entirely possible to allocate an object from one * CPU cache and return it to another. 
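(Editor's note: a toy, single-threaded illustration of the magazine idea used by the allocation and free paths above: a small per-CPU stack absorbs most allocs and frees, and only overflow or underflow touches the shared slab lists. This deliberately omits IRQ disabling, per-CPU indexing, and the batch refill/flush the real code performs.)

    #include <stdio.h>

    #define MAG_SIZE        4       /* illustrative magazine size */

    struct magazine {
            void    *objs[MAG_SIZE];
            int     avail;
    };

    /* Stand-ins for the slow path that touches the shared slab lists. */
    static void *
    slab_alloc(void)
    {
            static int objs[64];
            static int n;
            return (&objs[n++]);
    }

    static void
    slab_free(void *obj)
    {
            (void) obj;
    }

    static void *
    mag_alloc(struct magazine *skm)
    {
            if (skm->avail > 0)
                    return (skm->objs[--skm->avail]);   /* fast path */
            return (slab_alloc());                      /* refill/slow path */
    }

    static void
    mag_free(struct magazine *skm, void *obj)
    {
            if (skm->avail >= MAG_SIZE)                 /* full: flush one */
                    slab_free(skm->objs[--skm->avail]);
            skm->objs[skm->avail++] = obj;
    }

    int
    main(void)
    {
            struct magazine m = { .avail = 0 };
            void *a = mag_alloc(&m), *b = mag_alloc(&m);

            mag_free(&m, a);
            mag_free(&m, b);
            printf("cached %d objects\n", m.avail);
            return (0);
    }
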
*/ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); /* * Per-CPU cache full, flush it to make space for this object, * this may result in an empty slab which can be reclaimed once * interrupts are re-enabled. */ if (unlikely(skm->skm_avail >= skm->skm_size)) { spl_cache_flush(skc, skm, skm->skm_refill); do_reclaim = 1; } /* Available space in cache, use it */ skm->skm_objs[skm->skm_avail++] = obj; local_irq_restore(flags); if (do_reclaim) spl_slab_reclaim(skc); } EXPORT_SYMBOL(spl_kmem_cache_free); /* * Depending on how many and which objects are released it may simply * repopulate the local magazine which will then need to age-out. Objects * which cannot fit in the magazine will be released back to their slabs * which will also need to age out before being released. This is all just * best effort and we do not want to thrash creating and destroying slabs. */ void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) { ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); if (skc->skc_flags & KMC_SLAB) return; atomic_inc(&skc->skc_ref); /* * Prevent concurrent cache reaping when contended. */ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) goto out; /* Reclaim from the magazine and free all now empty slabs. */ unsigned long irq_flags; local_irq_save(irq_flags); spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; spl_cache_flush(skc, skm, skm->skm_avail); local_irq_restore(irq_flags); spl_slab_reclaim(skc); clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); out: atomic_dec(&skc->skc_ref); } EXPORT_SYMBOL(spl_kmem_cache_reap_now); /* * This is stubbed out for code consistency with other platforms. There * is existing logic to prevent concurrent reaping so while this is ugly * it should do no harm. */ int spl_kmem_cache_reap_active(void) { return (0); } EXPORT_SYMBOL(spl_kmem_cache_reap_active); /* * Reap all free slabs from all registered caches. */ void spl_kmem_reap(void) { spl_kmem_cache_t *skc = NULL; down_read(&spl_kmem_cache_sem); list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { spl_kmem_cache_reap_now(skc); } up_read(&spl_kmem_cache_sem); } EXPORT_SYMBOL(spl_kmem_reap); int spl_kmem_cache_init(void) { init_rwsem(&spl_kmem_cache_sem); INIT_LIST_HEAD(&spl_kmem_cache_list); spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", spl_kmem_cache_kmem_threads, maxclsyspri, spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (spl_kmem_cache_taskq == NULL) return (-ENOMEM); return (0); } void spl_kmem_cache_fini(void) { taskq_destroy(spl_kmem_cache_taskq); } diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c index c6d3c8f4413f..4308581147a9 100644 --- a/module/os/linux/spl/spl-kstat.c +++ b/module/os/linux/spl/spl-kstat.c @@ -1,715 +1,715 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. 
* * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Kstat Implementation. * * Links to Illumos.org for more information on kstat function: * [1] https://illumos.org/man/1M/kstat * [2] https://illumos.org/man/9f/kstat_create */ #include #include #include #include #include static kmutex_t kstat_module_lock; static struct list_head kstat_module_list; static kid_t kstat_id; static int kstat_resize_raw(kstat_t *ksp) { if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) return (ENOMEM); vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); return (0); } static int kstat_seq_show_headers(struct seq_file *f) { kstat_t *ksp = (kstat_t *)f->private; int rc = 0; ASSERT(ksp->ks_magic == KS_MAGIC); seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", ksp->ks_kid, ksp->ks_type, ksp->ks_flags, ksp->ks_ndata, (int)ksp->ks_data_size, ksp->ks_crtime, ksp->ks_snaptime); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: restart: if (ksp->ks_raw_ops.headers) { rc = ksp->ks_raw_ops.headers( ksp->ks_raw_buf, ksp->ks_raw_bufsize); if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (!rc) seq_puts(f, ksp->ks_raw_buf); } else { seq_printf(f, "raw data\n"); } break; case KSTAT_TYPE_NAMED: seq_printf(f, "%-31s %-4s %s\n", "name", "type", "data"); break; case KSTAT_TYPE_INTR: seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", "hard", "soft", "watchdog", "spurious", "multsvc"); break; case KSTAT_TYPE_IO: seq_printf(f, "%-8s %-8s %-8s %-8s %-8s %-8s " "%-8s %-8s %-8s %-8s %-8s %-8s\n", "nread", "nwritten", "reads", "writes", "wtime", "wlentime", "wupdate", "rtime", "rlentime", "rupdate", "wcnt", "rcnt"); break; case KSTAT_TYPE_TIMER: seq_printf(f, "%-31s %-8s " "%-8s %-8s %-8s %-8s %-8s\n", "name", "events", "elapsed", "min", "max", "start", "stop"); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (-rc); } static int kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) { int i, j; for (i = 0; ; i++) { seq_printf(f, "%03x:", i); for (j = 0; j < 16; j++) { if (i * 16 + j >= l) { seq_printf(f, "\n"); goto out; } seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]); } seq_printf(f, "\n"); } out: return (0); } static int kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) { seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); switch (knp->data_type) { case KSTAT_DATA_CHAR: knp->value.c[15] = '\0'; /* NULL terminate */ seq_printf(f, "%-16s", knp->value.c); break; /* * NOTE - We need to be more careful able what tokens are * used for each arch, for now this is correct for x86_64. 
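(Editor's note: the raw-kstat paths around kstat_resize_raw() above use a grow-and-retry pattern: if the formatter reports ENOMEM the buffer is doubled, capped at KSTAT_RAW_MAX, and the call is retried. A user-space sketch of the same pattern; the formatter and the cap value are illustrative.)

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define RAW_MAX         (64 * 1024)     /* stand-in for KSTAT_RAW_MAX */

    /* Pretend formatter: fails with ENOMEM unless the buffer is big enough. */
    static int
    format_raw(char *buf, size_t size)
    {
            const char *msg = "a fairly long raw kstat payload";

            if (size < strlen(msg) + 1)
                    return (ENOMEM);
            strcpy(buf, msg);
            return (0);
    }

    int
    main(void)
    {
            size_t bufsize = 16;
            char *buf = malloc(bufsize);
            int rc;

            if (buf == NULL)
                    return (ENOMEM);
            while ((rc = format_raw(buf, bufsize)) == ENOMEM) {
                    if (bufsize == RAW_MAX)
                            break;          /* give up, keep ENOMEM */
                    bufsize = bufsize * 2 > RAW_MAX ? RAW_MAX : bufsize * 2;
                    free(buf);
                    buf = malloc(bufsize);
                    if (buf == NULL)
                            return (ENOMEM);
            }
            if (rc == 0)
                    printf("%s\n", buf);
            free(buf);
            return (rc);
    }
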
*/ case KSTAT_DATA_INT32: seq_printf(f, "%d", knp->value.i32); break; case KSTAT_DATA_UINT32: seq_printf(f, "%u", knp->value.ui32); break; case KSTAT_DATA_INT64: seq_printf(f, "%lld", (signed long long)knp->value.i64); break; case KSTAT_DATA_UINT64: seq_printf(f, "%llu", (unsigned long long)knp->value.ui64); break; case KSTAT_DATA_LONG: seq_printf(f, "%ld", knp->value.l); break; case KSTAT_DATA_ULONG: seq_printf(f, "%lu", knp->value.ul); break; case KSTAT_DATA_STRING: KSTAT_NAMED_STR_PTR(knp) [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); break; default: PANIC("Undefined kstat data type %d\n", knp->data_type); } seq_printf(f, "\n"); return (0); } static int kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) { seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", kip->intrs[KSTAT_INTR_HARD], kip->intrs[KSTAT_INTR_SOFT], kip->intrs[KSTAT_INTR_WATCHDOG], kip->intrs[KSTAT_INTR_SPURIOUS], kip->intrs[KSTAT_INTR_MULTSVC]); return (0); } static int kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) { /* though wlentime & friends are signed, they will never be negative */ seq_printf(f, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, kip->reads, kip->writes, kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); return (0); } static int kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) { seq_printf(f, "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n", ktp->name, ktp->num_events, ktp->elapsed_time, ktp->min_time, ktp->max_time, ktp->start_time, ktp->stop_time); return (0); } static int kstat_seq_show(struct seq_file *f, void *p) { kstat_t *ksp = (kstat_t *)f->private; int rc = 0; ASSERT(ksp->ks_magic == KS_MAGIC); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: restart: if (ksp->ks_raw_ops.data) { rc = ksp->ks_raw_ops.data( ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (!rc) seq_puts(f, ksp->ks_raw_buf); } else { ASSERT(ksp->ks_ndata == 1); rc = kstat_seq_show_raw(f, ksp->ks_data, ksp->ks_data_size); } break; case KSTAT_TYPE_NAMED: rc = kstat_seq_show_named(f, (kstat_named_t *)p); break; case KSTAT_TYPE_INTR: rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); break; case KSTAT_TYPE_IO: rc = kstat_seq_show_io(f, (kstat_io_t *)p); break; case KSTAT_TYPE_TIMER: rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (-rc); } static int kstat_default_update(kstat_t *ksp, int rw) { ASSERT(ksp != NULL); if (rw == KSTAT_WRITE) return (EACCES); return (0); } static void * kstat_seq_data_addr(kstat_t *ksp, loff_t n) { void *rc = NULL; switch (ksp->ks_type) { case KSTAT_TYPE_RAW: if (ksp->ks_raw_ops.addr) rc = ksp->ks_raw_ops.addr(ksp, n); else rc = ksp->ks_data; break; case KSTAT_TYPE_NAMED: rc = ksp->ks_data + n * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: rc = ksp->ks_data + n * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: rc = ksp->ks_data + n * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: rc = ksp->ks_data + n * sizeof (kstat_timer_t); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (rc); } static void * kstat_seq_start(struct seq_file *f, loff_t *pos) { loff_t n = *pos; kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); mutex_enter(ksp->ks_lock); if (ksp->ks_type == KSTAT_TYPE_RAW) { ksp->ks_raw_bufsize = PAGE_SIZE; ksp->ks_raw_buf = 
vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); } /* Dynamically update kstat, on error existing kstats are used */ (void) ksp->ks_update(ksp, KSTAT_READ); ksp->ks_snaptime = gethrtime(); if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n && kstat_seq_show_headers(f)) return (NULL); if (n >= ksp->ks_ndata) return (NULL); return (kstat_seq_data_addr(ksp, n)); } static void * kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) { kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); ++*pos; if (*pos >= ksp->ks_ndata) return (NULL); return (kstat_seq_data_addr(ksp, *pos)); } static void kstat_seq_stop(struct seq_file *f, void *v) { kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); if (ksp->ks_type == KSTAT_TYPE_RAW) vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); mutex_exit(ksp->ks_lock); } static const struct seq_operations kstat_seq_ops = { .show = kstat_seq_show, .start = kstat_seq_start, .next = kstat_seq_next, .stop = kstat_seq_stop, }; static kstat_module_t * kstat_find_module(char *name) { kstat_module_t *module = NULL; list_for_each_entry(module, &kstat_module_list, ksm_module_list) { if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) return (module); } return (NULL); } static kstat_module_t * kstat_create_module(char *name) { kstat_module_t *module; struct proc_dir_entry *pde; pde = proc_mkdir(name, proc_spl_kstat); if (pde == NULL) return (NULL); module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); module->ksm_proc = pde; - strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); + strlcpy(module->ksm_name, name, KSTAT_STRLEN); INIT_LIST_HEAD(&module->ksm_kstat_list); list_add_tail(&module->ksm_module_list, &kstat_module_list); return (module); } static void kstat_delete_module(kstat_module_t *module) { ASSERT(list_empty(&module->ksm_kstat_list)); remove_proc_entry(module->ksm_name, proc_spl_kstat); list_del(&module->ksm_module_list); kmem_free(module, sizeof (kstat_module_t)); } static int proc_kstat_open(struct inode *inode, struct file *filp) { struct seq_file *f; int rc; rc = seq_open(filp, &kstat_seq_ops); if (rc) return (rc); f = filp->private_data; f->private = SPL_PDE_DATA(inode); return (0); } static ssize_t proc_kstat_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct seq_file *f = filp->private_data; kstat_t *ksp = f->private; int rc; ASSERT(ksp->ks_magic == KS_MAGIC); mutex_enter(ksp->ks_lock); rc = ksp->ks_update(ksp, KSTAT_WRITE); mutex_exit(ksp->ks_lock); if (rc) return (-rc); *ppos += len; return (len); } static const kstat_proc_op_t proc_kstat_operations = { #ifdef HAVE_PROC_OPS_STRUCT .proc_open = proc_kstat_open, .proc_write = proc_kstat_write, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, #else .open = proc_kstat_open, .write = proc_kstat_write, .read = seq_read, .llseek = seq_lseek, .release = seq_release, #endif }; void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } EXPORT_SYMBOL(__kstat_set_raw_ops); void kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, const char *name) { kpep->kpe_owner = NULL; kpep->kpe_proc = NULL; INIT_LIST_HEAD(&kpep->kpe_list); - strncpy(kpep->kpe_module, module, KSTAT_STRLEN); - strncpy(kpep->kpe_name, name, KSTAT_STRLEN); + strlcpy(kpep->kpe_module, module, sizeof (kpep->kpe_module)); + 
strlcpy(kpep->kpe_name, name, sizeof (kpep->kpe_name)); } EXPORT_SYMBOL(kstat_proc_entry_init); kstat_t * __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags) { kstat_t *ksp; ASSERT(ks_module); ASSERT(ks_instance == 0); ASSERT(ks_name); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) ASSERT(ks_ndata == 1); ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); if (ksp == NULL) return (ksp); mutex_enter(&kstat_module_lock); ksp->ks_kid = kstat_id; kstat_id++; mutex_exit(&kstat_module_lock); ksp->ks_magic = KS_MAGIC; mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); ksp->ks_lock = &ksp->ks_private_lock; ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; ksp->ks_instance = ks_instance; - strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); + strlcpy(ksp->ks_class, ks_class, sizeof (ksp->ks_class)); ksp->ks_type = ks_type; ksp->ks_flags = ks_flags; ksp->ks_update = kstat_default_update; ksp->ks_private = NULL; ksp->ks_raw_ops.headers = NULL; ksp->ks_raw_ops.data = NULL; ksp->ks_raw_ops.addr = NULL; ksp->ks_raw_buf = NULL; ksp->ks_raw_bufsize = 0; kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: ksp->ks_ndata = 1; ksp->ks_data_size = ks_ndata; break; case KSTAT_TYPE_NAMED: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { ksp->ks_data = NULL; } else { ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); if (ksp->ks_data == NULL) { kmem_free(ksp, sizeof (*ksp)); ksp = NULL; } } return (ksp); } EXPORT_SYMBOL(__kstat_create); static int kstat_detect_collision(kstat_proc_entry_t *kpep) { kstat_module_t *module; kstat_proc_entry_t *tmp = NULL; char *parent; char *cp; parent = kmem_asprintf("%s", kpep->kpe_module); if ((cp = strrchr(parent, '/')) == NULL) { kmem_strfree(parent); return (0); } cp[0] = '\0'; if ((module = kstat_find_module(parent)) != NULL) { list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { kmem_strfree(parent); return (EEXIST); } } } kmem_strfree(parent); return (0); } /* * Add a file to the proc filesystem under the kstat namespace (i.e. * /proc/spl/kstat/). The file need not necessarily be implemented as a * kstat. */ void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, const kstat_proc_op_t *proc_ops, void *data) { kstat_module_t *module; kstat_proc_entry_t *tmp = NULL; ASSERT(kpep); mutex_enter(&kstat_module_lock); module = kstat_find_module(kpep->kpe_module); if (module == NULL) { if (kstat_detect_collision(kpep) != 0) { cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ " collision", kpep->kpe_module, kpep->kpe_name); goto out; } module = kstat_create_module(kpep->kpe_module); if (module == NULL) goto out; } /* * Only one entry by this name per-module, on failure the module * shouldn't be deleted because we know it has at least one entry. 
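 *
 * Illustrative aside, not part of this change: why strlcpy() is preferred
 * over strncpy() throughout this patch.  With a four-byte destination and
 * the source "abcd":
 *
 *	char dst[4];
 *	strncpy(dst, "abcd", sizeof (dst));	-> dst = 'a','b','c','d'  (no NUL)
 *	strlcpy(dst, "abcd", sizeof (dst));	-> dst = 'a','b','c','\0' (always
 *						   terminated, source truncated)
 *
 * strlcpy() also returns strlen(src), so callers can detect truncation by
 * checking whether the return value is >= the destination size.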
*/ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) goto out; } list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); kpep->kpe_owner = module; kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode, module->ksm_proc, proc_ops, data); if (kpep->kpe_proc == NULL) { list_del_init(&kpep->kpe_list); if (list_empty(&module->ksm_kstat_list)) kstat_delete_module(module); } out: mutex_exit(&kstat_module_lock); } EXPORT_SYMBOL(kstat_proc_entry_install); void __kstat_install(kstat_t *ksp) { ASSERT(ksp); mode_t mode; /* Specify permission modes for different kstats */ if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) { mode = 0600; } else { mode = 0644; } kstat_proc_entry_install( &ksp->ks_proc, mode, &proc_kstat_operations, ksp); } EXPORT_SYMBOL(__kstat_install); void kstat_proc_entry_delete(kstat_proc_entry_t *kpep) { kstat_module_t *module = kpep->kpe_owner; if (kpep->kpe_proc) remove_proc_entry(kpep->kpe_name, module->ksm_proc); mutex_enter(&kstat_module_lock); list_del_init(&kpep->kpe_list); /* * Remove top level module directory if it wasn't empty before, but now * is. */ if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list)) kstat_delete_module(module); mutex_exit(&kstat_module_lock); } EXPORT_SYMBOL(kstat_proc_entry_delete); void __kstat_delete(kstat_t *ksp) { kstat_proc_entry_delete(&ksp->ks_proc); if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) kmem_free(ksp->ks_data, ksp->ks_data_size); ksp->ks_lock = NULL; mutex_destroy(&ksp->ks_private_lock); kmem_free(ksp, sizeof (*ksp)); } EXPORT_SYMBOL(__kstat_delete); int spl_kstat_init(void) { mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL); INIT_LIST_HEAD(&kstat_module_list); kstat_id = 0; return (0); } void spl_kstat_fini(void) { ASSERT(list_empty(&kstat_module_list)); mutex_destroy(&kstat_module_lock); } diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index 32a2d34b1d93..b863945a1c59 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -1,207 +1,207 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Thread Implementation. 
*/ #include #include #include /* * Thread interfaces */ typedef struct thread_priv_s { unsigned long tp_magic; /* Magic */ int tp_name_size; /* Name size */ char *tp_name; /* Name (without _thread suffix) */ void (*tp_func)(void *); /* Registered function */ void *tp_args; /* Args to be passed to function */ size_t tp_len; /* Len to be passed to function */ int tp_state; /* State to start thread at */ pri_t tp_pri; /* Priority to start threat at */ } thread_priv_t; static int thread_generic_wrapper(void *arg) { thread_priv_t *tp = (thread_priv_t *)arg; void (*func)(void *); void *args; ASSERT(tp->tp_magic == TP_MAGIC); func = tp->tp_func; args = tp->tp_args; set_current_state(tp->tp_state); set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); kmem_free(tp->tp_name, tp->tp_name_size); kmem_free(tp, sizeof (thread_priv_t)); if (func) func(args); return (0); } /* * thread_create() may block forever if it cannot create a thread or * allocate memory. This is preferable to returning a NULL which Solaris * style callers likely never check for... since it can't fail. */ kthread_t * __thread_create(caddr_t stk, size_t stksize, thread_func_t func, const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri) { thread_priv_t *tp; struct task_struct *tsk; char *p; /* Option pp is simply ignored */ /* Variable stack size unsupported */ ASSERT(stk == NULL); tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); if (tp == NULL) return (NULL); tp->tp_magic = TP_MAGIC; tp->tp_name_size = strlen(name) + 1; tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE); if (tp->tp_name == NULL) { kmem_free(tp, sizeof (thread_priv_t)); return (NULL); } - strncpy(tp->tp_name, name, tp->tp_name_size); + strlcpy(tp->tp_name, name, tp->tp_name_size); /* * Strip trailing "_thread" from passed name which will be the func * name since the exposed API has no parameter for passing a name. */ p = strstr(tp->tp_name, "_thread"); if (p) p[0] = '\0'; tp->tp_func = func; tp->tp_args = args; tp->tp_len = len; tp->tp_state = state; tp->tp_pri = pri; tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, "%s", tp->tp_name); if (IS_ERR(tsk)) return (NULL); wake_up_process(tsk); return ((kthread_t *)tsk); } EXPORT_SYMBOL(__thread_create); /* * spl_kthread_create - Wrapper providing pre-3.13 semantics for * kthread_create() in which it is not killable and less likely * to return -ENOMEM. */ struct task_struct * spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) { struct task_struct *tsk; va_list args; char name[TASK_COMM_LEN]; va_start(args, namefmt); vsnprintf(name, sizeof (name), namefmt, args); va_end(args); do { tsk = kthread_create(func, data, "%s", name); if (IS_ERR(tsk)) { if (signal_pending(current)) { clear_thread_flag(TIF_SIGPENDING); continue; } if (PTR_ERR(tsk) == -ENOMEM) continue; return (NULL); } else { return (tsk); } } while (1); } EXPORT_SYMBOL(spl_kthread_create); /* * The "why" argument indicates the allowable side-effects of the call: * * FORREAL: Extract the next pending signal from p_sig into p_cursig; * stop the process if a stop has been requested or if a traced signal * is pending. * * JUSTLOOKING: Don't stop the process, just indicate whether or not * a signal might be pending (FORREAL is needed to tell for sure). 
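 *
 * Illustrative aside, not part of this change: the usual caller pattern for
 * the two modes described above.  A long-running loop first asks cheaply
 * whether a signal might be pending (JUSTLOOKING) and only then dequeues it
 * for real (FORREAL), bailing out with EINTR.  The helper names in this
 * sketch are hypothetical; only the issig() idiom is taken from this code:
 *
 *	while (more_work(arg)) {			(hypothetical condition)
 *		if (issig(JUSTLOOKING) && issig(FORREAL))
 *			return (SET_ERROR(EINTR));
 *		do_one_unit(arg);			(hypothetical work item)
 *	}
 *	return (0);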
*/ int issig(int why) { ASSERT(why == FORREAL || why == JUSTLOOKING); if (!signal_pending(current)) return (0); if (why != FORREAL) return (1); struct task_struct *task = current; spl_kernel_siginfo_t __info; sigset_t set; siginitsetinv(&set, 1ULL << (SIGSTOP - 1) | 1ULL << (SIGTSTP - 1)); sigorsets(&set, &task->blocked, &set); spin_lock_irq(&task->sighand->siglock); int ret; #ifdef HAVE_DEQUEUE_SIGNAL_4ARG enum pid_type __type; if ((ret = dequeue_signal(task, &set, &__info, &__type)) != 0) { #else if ((ret = dequeue_signal(task, &set, &__info)) != 0) { #endif #ifdef HAVE_SIGNAL_STOP spin_unlock_irq(&task->sighand->siglock); kernel_signal_stop(); #else if (current->jobctl & JOBCTL_STOP_DEQUEUED) spl_set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); schedule(); #endif return (0); } spin_unlock_irq(&task->sighand->siglock); return (1); } EXPORT_SYMBOL(issig); diff --git a/module/os/linux/spl/spl-zone.c b/module/os/linux/spl/spl-zone.c index 234ae7f6cd0c..9421f81bf0c8 100644 --- a/module/os/linux/spl/spl-zone.c +++ b/module/os/linux/spl/spl-zone.c @@ -1,424 +1,423 @@ /* * Copyright (c) 2021 Klara Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #if defined(CONFIG_USER_NS) #include #include #endif static kmutex_t zone_datasets_lock; static struct list_head zone_datasets; typedef struct zone_datasets { struct list_head zds_list; /* zone_datasets linkage */ struct user_namespace *zds_userns; /* namespace reference */ struct list_head zds_datasets; /* datasets for the namespace */ } zone_datasets_t; typedef struct zone_dataset { struct list_head zd_list; /* zone_dataset linkage */ size_t zd_dsnamelen; /* length of name */ char zd_dsname[0]; /* name of the member dataset */ } zone_dataset_t; #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) /* * Returns: * - 0 on success * - EBADF if it cannot open the provided file descriptor * - ENOTTY if the file itself is a not a user namespace file. We want to * intercept this error in the ZFS layer. We cannot just return one of the * ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS * and the SPL layers. 
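 *
 * Illustrative aside, not part of this change: the file descriptor passed in
 * here is expected to refer to a user-namespace file, which user space
 * typically obtains by opening /proc/<pid>/ns/user for the target container.
 * A minimal userspace sketch of such a caller (the pid argument and error
 * handling are hypothetical; the plumbing that eventually hands the fd to
 * zone_dataset_attach() is omitted):
 *
 *	#include <sys/types.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	static int
 *	open_user_ns_fd(pid_t pid)
 *	{
 *		char path[64];
 *
 *		(void) snprintf(path, sizeof (path), "/proc/%d/ns/user",
 *		    (int)pid);
 *		return (open(path, O_RDONLY));
 *	}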
*/ static int user_ns_get(int fd, struct user_namespace **userns) { struct kstatfs st; struct file *nsfile; struct ns_common *ns; int error; if ((nsfile = fget(fd)) == NULL) return (EBADF); if (vfs_statfs(&nsfile->f_path, &st) != 0) { error = ENOTTY; goto done; } if (st.f_type != NSFS_MAGIC) { error = ENOTTY; goto done; } ns = get_proc_ns(file_inode(nsfile)); if (ns->ops->type != CLONE_NEWUSER) { error = ENOTTY; goto done; } *userns = container_of(ns, struct user_namespace, ns); error = 0; done: fput(nsfile); return (error); } #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ static unsigned int user_ns_zoneid(struct user_namespace *user_ns) { unsigned int r; #if defined(HAVE_USER_NS_COMMON_INUM) r = user_ns->ns.inum; #else r = user_ns->proc_inum; #endif return (r); } static struct zone_datasets * zone_datasets_lookup(unsigned int nsinum) { zone_datasets_t *zds; list_for_each_entry(zds, &zone_datasets, zds_list) { if (user_ns_zoneid(zds->zds_userns) == nsinum) return (zds); } return (NULL); } #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) static struct zone_dataset * zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen) { zone_dataset_t *zd; list_for_each_entry(zd, &zds->zds_datasets, zd_list) { if (zd->zd_dsnamelen != dsnamelen) continue; if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) return (zd); } return (NULL); } static int zone_dataset_cred_check(cred_t *cred) { if (!uid_eq(cred->uid, GLOBAL_ROOT_UID)) return (EPERM); return (0); } #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ static int zone_dataset_name_check(const char *dataset, size_t *dsnamelen) { if (dataset[0] == '\0' || dataset[0] == '/') return (ENOENT); *dsnamelen = strlen(dataset); /* Ignore trailing slash, if supplied. */ if (dataset[*dsnamelen - 1] == '/') (*dsnamelen)--; return (0); } int zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd) { #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) struct user_namespace *userns; zone_datasets_t *zds; zone_dataset_t *zd; int error; size_t dsnamelen; if ((error = zone_dataset_cred_check(cred)) != 0) return (error); if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) return (error); if ((error = user_ns_get(userns_fd, &userns)) != 0) return (error); mutex_enter(&zone_datasets_lock); zds = zone_datasets_lookup(user_ns_zoneid(userns)); if (zds == NULL) { zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP); INIT_LIST_HEAD(&zds->zds_list); INIT_LIST_HEAD(&zds->zds_datasets); zds->zds_userns = userns; /* * Lock the namespace by incresing its refcount to prevent * the namespace ID from being reused. 
*/ get_user_ns(userns); list_add_tail(&zds->zds_list, &zone_datasets); } else { zd = zone_dataset_lookup(zds, dataset, dsnamelen); if (zd != NULL) { mutex_exit(&zone_datasets_lock); return (EEXIST); } } zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP); zd->zd_dsnamelen = dsnamelen; - strncpy(zd->zd_dsname, dataset, dsnamelen); - zd->zd_dsname[dsnamelen] = '\0'; + strlcpy(zd->zd_dsname, dataset, dsnamelen + 1); INIT_LIST_HEAD(&zd->zd_list); list_add_tail(&zd->zd_list, &zds->zds_datasets); mutex_exit(&zone_datasets_lock); return (0); #else return (ENXIO); #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ } EXPORT_SYMBOL(zone_dataset_attach); int zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd) { #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) struct user_namespace *userns; zone_datasets_t *zds; zone_dataset_t *zd; int error; size_t dsnamelen; if ((error = zone_dataset_cred_check(cred)) != 0) return (error); if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) return (error); if ((error = user_ns_get(userns_fd, &userns)) != 0) return (error); mutex_enter(&zone_datasets_lock); zds = zone_datasets_lookup(user_ns_zoneid(userns)); if (zds != NULL) zd = zone_dataset_lookup(zds, dataset, dsnamelen); if (zds == NULL || zd == NULL) { mutex_exit(&zone_datasets_lock); return (ENOENT); } list_del(&zd->zd_list); kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); /* Prune the namespace entry if it has no more delegations. */ if (list_empty(&zds->zds_datasets)) { /* * Decrease the refcount now that the namespace is no longer * used. It is no longer necessary to prevent the namespace ID * from being reused. */ put_user_ns(userns); list_del(&zds->zds_list); kmem_free(zds, sizeof (*zds)); } mutex_exit(&zone_datasets_lock); return (0); #else return (ENXIO); #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ } EXPORT_SYMBOL(zone_dataset_detach); /* * A dataset is visible if: * - It is a parent of a namespace entry. * - It is one of the namespace entries. * - It is a child of a namespace entry. * * A dataset is writable if: * - It is one of the namespace entries. * - It is a child of a namespace entry. * * The parent datasets of namespace entries are visible and * read-only to provide a path back to the root of the pool. */ int zone_dataset_visible(const char *dataset, int *write) { zone_datasets_t *zds; zone_dataset_t *zd; size_t dsnamelen, zd_len; int visible; /* Default to read-only, in case visible is returned. */ if (write != NULL) *write = 0; if (zone_dataset_name_check(dataset, &dsnamelen) != 0) return (0); if (INGLOBALZONE(curproc)) { if (write != NULL) *write = 1; return (1); } mutex_enter(&zone_datasets_lock); zds = zone_datasets_lookup(crgetzoneid(curproc->cred)); if (zds == NULL) { mutex_exit(&zone_datasets_lock); return (0); } visible = 0; list_for_each_entry(zd, &zds->zds_datasets, zd_list) { zd_len = strlen(zd->zd_dsname); if (zd_len > dsnamelen) { /* * The name of the namespace entry is longer than that * of the dataset, so it could be that the dataset is a * parent of the namespace entry. */ visible = memcmp(zd->zd_dsname, dataset, dsnamelen) == 0 && zd->zd_dsname[dsnamelen] == '/'; if (visible) break; } else if (zd_len == dsnamelen) { /* * The name of the namespace entry is as long as that * of the dataset, so perhaps the dataset itself is the * namespace entry. 
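 *
 * Illustrative aside, not part of this change: a worked example of the
 * visibility rules documented above.  Suppose "tank/data/zone1" is the only
 * dataset delegated to this namespace (i.e. the only namespace entry).
 * Then, for processes in that namespace:
 *
 *	tank			visible, read-only	(parent of the entry)
 *	tank/data		visible, read-only	(parent of the entry)
 *	tank/data/zone1		visible, writable	(the entry itself)
 *	tank/data/zone1/home	visible, writable	(child of the entry)
 *	tank/other		not visible		(unrelated dataset)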
*/ visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0; if (visible) { if (write != NULL) *write = 1; break; } } else { /* * The name of the namespace entry is shorter than that * of the dataset, so perhaps the dataset is a child of * the namespace entry. */ visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0 && dataset[zd_len] == '/'; if (visible) { if (write != NULL) *write = 1; break; } } } mutex_exit(&zone_datasets_lock); return (visible); } EXPORT_SYMBOL(zone_dataset_visible); unsigned int global_zoneid(void) { unsigned int z = 0; #if defined(CONFIG_USER_NS) z = user_ns_zoneid(&init_user_ns); #endif return (z); } EXPORT_SYMBOL(global_zoneid); unsigned int crgetzoneid(const cred_t *cr) { unsigned int r = 0; #if defined(CONFIG_USER_NS) r = user_ns_zoneid(cr->user_ns); #endif return (r); } EXPORT_SYMBOL(crgetzoneid); boolean_t inglobalzone(proc_t *proc) { #if defined(CONFIG_USER_NS) return (proc->cred->user_ns == &init_user_ns); #else return (B_TRUE); #endif } EXPORT_SYMBOL(inglobalzone); int spl_zone_init(void) { mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL); INIT_LIST_HEAD(&zone_datasets); return (0); } void spl_zone_fini(void) { zone_datasets_t *zds; zone_dataset_t *zd; /* * It would be better to assert an empty zone_datasets, but since * there's no automatic mechanism for cleaning them up if the user * namespace is destroyed, just do it here, since spl is about to go * out of context. */ while (!list_empty(&zone_datasets)) { zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list); while (!list_empty(&zds->zds_datasets)) { zd = list_entry(zds->zds_datasets.next, zone_dataset_t, zd_list); list_del(&zd->zd_list); kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); } put_user_ns(zds->zds_userns); list_del(&zds->zds_list); kmem_free(zds, sizeof (*zds)); } mutex_destroy(&zone_datasets_lock); } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index b328959464b2..d93c7f08c1c2 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -1,2478 +1,2476 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" /* * Filesystem and Snapshot Limits * ------------------------------ * * These limits are used to restrict the number of filesystems and/or snapshots * that can be created at a given level in the tree or below. A typical * use-case is with a delegated dataset where the administrator wants to ensure * that a user within the zone is not creating too many additional filesystems * or snapshots, even though they're not exceeding their space quota. * * The filesystem and snapshot counts are stored as extensible properties. This * capability is controlled by a feature flag and must be enabled to be used. * Once enabled, the feature is not active until the first limit is set. At * that point, future operations to create/destroy filesystems or snapshots * will validate and update the counts. * * Because the count properties will not exist before the feature is active, * the counts are updated when a limit is first set on an uninitialized * dsl_dir node in the tree (The filesystem/snapshot count on a node includes * all of the nested filesystems/snapshots. Thus, a new leaf node has a * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and * snapshot count properties on a node indicate uninitialized counts on that * node.) When first setting a limit on an uninitialized node, the code starts * at the filesystem with the new limit and descends into all sub-filesystems * to add the count properties. * * In practice this is lightweight since a limit is typically set when the * filesystem is created and thus has no children. Once valid, changing the * limit value won't require a re-traversal since the counts are already valid. * When recursively fixing the counts, if a node with a limit is encountered * during the descent, the counts are known to be valid and there is no need to * descend into that filesystem's children. The counts on filesystems above the * one with the new limit will still be uninitialized, unless a limit is * eventually set on one of those filesystems. The counts are always recursively * updated when a limit is set on a dataset, unless there is already a limit. * When a new limit value is set on a filesystem with an existing limit, it is * possible for the new limit to be less than the current count at that level * since a user who can change the limit is also allowed to exceed the limit. * * Once the feature is active, then whenever a filesystem or snapshot is * created, the code recurses up the tree, validating the new count against the * limit at each initialized level. In practice, most levels will not have a * limit set. If there is a limit at any initialized level up the tree, the * check must pass or the creation will fail. Likewise, when a filesystem or * snapshot is destroyed, the counts are recursively adjusted all the way up * the initialized nodes in the tree. Renaming a filesystem into different point * in the tree will first validate, then update the counts on each branch up to * the common ancestor. A receive will also validate the counts and then update * them. * * An exception to the above behavior is that the limit is not enforced if the * user has permission to modify the limit. This is primarily so that * recursive snapshots in the global zone always work. 
We want to prevent a * denial-of-service in which a lower level delegated dataset could max out its * limit and thus block recursive snapshots from being taken in the global zone. * Because of this, it is possible for the snapshot count to be over the limit * and snapshots taken in the global zone could cause a lower level dataset to * hit or exceed its limit. The administrator taking the global zone recursive * snapshot should be aware of this side-effect and behave accordingly. * For consistency, the filesystem limit is also not enforced if the user can * modify the limit. * * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by * dsl_dir_init_fs_ss_count(). */ static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); typedef struct ddulrt_arg { dsl_dir_t *ddulrta_dd; uint64_t ddlrta_txg; } ddulrt_arg_t; static void dsl_dir_evict_async(void *dbu) { dsl_dir_t *dd = dbu; int t; dsl_pool_t *dp __maybe_unused = dd->dd_pool; dd->dd_dbuf = NULL; for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) dsl_dir_async_rele(dd->dd_parent, dd); spa_async_close(dd->dd_pool->dp_spa, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, const void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; dmu_object_info_t doi; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); dsl_prop_init(dd); if (dsl_dir_is_zapified(dd)) { err = zap_lookup(dp->dp_meta_objset, ddobj, DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj); if (err == 0) { /* check for on-disk format errata */ if (dsl_dir_incompatible_encryption_version( dd)) { dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; } } else if (err != ENOENT) { goto errout; } } if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strlcpy(dd->dd_myname, tail, sizeof (dd->dd_myname)); } else { err = zap_value_search(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err != 0) goto errout; } else { (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa), sizeof (dd->dd_myname)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't 
open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); if (dsl_dir_is_zapified(dd)) { uint64_t obj; err = zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); if (err == 0) dsl_dir_livelist_open(dd, obj); else if (err != ENOENT) goto errout; } } if (dsl_dir_is_zapified(dd)) { inode_timespec_t t = {0}; (void) zap_lookup(dp->dp_meta_objset, ddobj, DD_FIELD_SNAPSHOTS_CHANGED, sizeof (uint64_t), sizeof (inode_timespec_t) / sizeof (uint64_t), &t); dd->dd_snap_cmtime = t; } dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. */ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* * Remove a reference to the given dsl dir that is being asynchronously * released. Async releases occur from a taskq performing eviction of * dsl datasets and dirs. This process is identical to a normal release * with the exception of using the async API for releasing the reference on * the spa. 
*/ void dsl_dir_async_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_async_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&dd->dd_lock); } else { VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN); p = NULL; } else if (p[0] == '/') { if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; + (void) strlcpy(component, path, p - path + 1); p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; + (void) strlcpy(component, path, p - path + 1); } else { panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. The name must be in the specified dsl_pool_t. This * thread must hold the dp_config_rwlock for the pool. Returns NULL if the * path is bogus, or if tail==NULL and we couldn't parse the whole name. * (*tail)[0] == '@' means that the last component is a snapshot. */ int dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; uint64_t ddobj; buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); err = getcomponent(name, buf, &next); if (err != 0) goto error; /* Make sure the name is in the specified pool. 
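 *
 * Illustrative aside, not part of this change: how getcomponent() carves up
 * a name as dsl_dir_hold() walks it.  For name = "tank/home/user@snap" the
 * successive calls yield:
 *
 *	component = "tank"	next = "home/user@snap"
 *	component = "home"	next = "user@snap"
 *	component = "user"	next = "@snap"
 *	component = "@snap"	next = NULL
 *
 * so the walk below stops at the dsl_dir for "tank/home/user" and, when the
 * caller passed a non-NULL tailp, "@snap" is handed back as the tail.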
*/ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) { err = SET_ERROR(EXDEV); goto error; } ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err != 0) { goto error; } while (next != NULL) { dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) err = 0; break; } err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); if (err != 0) break; dsl_dir_rele(dd, tag); dd = child_dd; next = nextnext; } if (err != 0) { dsl_dir_rele(dd, tag); goto error; } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = SET_ERROR(ENOENT); } if (tailp != NULL) *tailp = next; if (err == 0) *ddp = dd; error: kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN); return (err); } /* * If the counts are already initialized for this filesystem and its * descendants then do nothing, otherwise initialize the counts. * * The counts on this filesystem, and those below, may be uninitialized due to * either the use of a pre-existing pool which did not support the * filesystem/snapshot limit feature, or one in which the feature had not yet * been enabled. * * Recursively descend the filesystem tree and update the filesystem/snapshot * counts on each filesystem below, then update the cumulative count on the * current filesystem. If the filesystem already has a count set on it, * then we know that its counts, and the counts on the filesystems below it, * are already correct, so we don't have to update this filesystem. */ static void dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) { uint64_t my_fs_cnt = 0; uint64_t my_ss_cnt = 0; dsl_pool_t *dp = dd->dd_pool; objset_t *os = dp->dp_meta_objset; zap_cursor_t *zc; zap_attribute_t *za; dsl_dataset_t *ds; ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); ASSERT(dsl_pool_config_held(dp)); ASSERT(dmu_tx_is_syncing(tx)); dsl_dir_zapify(dd, tx); /* * If the filesystem count has already been initialized then we * don't need to recurse down any further. */ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) return; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* Iterate my child dirs */ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dir_t *chld_dd; uint64_t count; VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, &chld_dd)); /* * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets. 
*/ if (chld_dd->dd_myname[0] == '$') { dsl_dir_rele(chld_dd, FTAG); continue; } my_fs_cnt++; /* count this child */ dsl_dir_init_fs_ss_count(chld_dd, tx); VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); my_fs_cnt += count; VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); my_ss_cnt += count; dsl_dir_rele(chld_dd, FTAG); } zap_cursor_fini(zc); /* Count my snapshots (we counted children's snapshots above) */ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { /* Don't count temporary snapshots */ if (za->za_name[0] != '%') my_ss_cnt++; } zap_cursor_fini(zc); dsl_dataset_rele(ds, FTAG); kmem_free(zc, sizeof (zap_cursor_t)); kmem_free(za, sizeof (zap_attribute_t)); /* we're in a sync task, update counts */ dmu_buf_will_dirty(dd->dd_dbuf, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); } static int dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; int error; error = dsl_dataset_hold(dp, ddname, FTAG, &ds); if (error != 0) return (error); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dd = ds->ds_dir; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && dsl_dir_is_zapified(dd) && zap_contains(dp->dp_meta_objset, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EALREADY)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; spa_t *spa; VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); spa = dsl_dataset_get_spa(ds); if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Since the feature was not active and we're now setting a * limit, increment the feature-active counter so that the * feature becomes active for the first time. * * We are already in a sync task so we can update the MOS. */ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); } /* * Since we are now setting a non-UINT64_MAX limit on the filesystem, * we need to ensure the counts are correct. Descend down the tree from * this point and update all of the counts to be accurate. */ dsl_dir_init_fs_ss_count(ds->ds_dir, tx); dsl_dataset_rele(ds, FTAG); } /* * Make sure the feature is enabled and activate it if necessary. * Since we're setting a limit, ensure the on-disk counts are valid. * This is only called by the ioctl path when setting a limit value. * * We do not need to validate the new limit, since users who can change the * limit are also allowed to exceed the limit. */ int dsl_dir_activate_fs_ss_limit(const char *ddname) { int error; error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, ZFS_SPACE_CHECK_RESERVED); if (error == EALREADY) error = 0; return (error); } /* * Used to determine if the filesystem_limit or snapshot_limit should be * enforced. We allow the limit to be exceeded if the user has permission to * write the property value. 
We pass in the creds that we got in the open * context since we will always be the GZ root in syncing context. We also have * to handle the case where we are allowed to change the limit on the current * dataset, but there may be another limit in the tree above. * * We can never modify these two properties within a non-global zone. In * addition, the other checks are modeled on zfs_secpolicy_write_perms. We * can't use that function since we are already holding the dp_config_rwlock. * In addition, we already have the dd and dealing with snapshots is simplified * in this code. */ typedef enum { ENFORCE_ALWAYS, ENFORCE_NEVER, ENFORCE_ABOVE } enforce_res_t; static enforce_res_t dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr, proc_t *proc) { enforce_res_t enforce = ENFORCE_ALWAYS; uint64_t obj; dsl_dataset_t *ds; uint64_t zoned; const char *zonedstr; ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); #ifdef _KERNEL if (crgetzoneid(cr) != GLOBAL_ZONEID) return (ENFORCE_ALWAYS); /* * We are checking the saved credentials of the user process, which is * not the current process. Note that we can't use secpolicy_zfs(), * because it only works if the cred is that of the current process (on * Linux). */ if (secpolicy_zfs_proc(cr, proc) == 0) return (ENFORCE_NEVER); #else (void) proc; #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) return (ENFORCE_ALWAYS); ASSERT(dsl_pool_config_held(dd->dd_pool)); if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) return (ENFORCE_ALWAYS); zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED); if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) { /* Only root can access zoned fs's from the GZ */ enforce = ENFORCE_ALWAYS; } else { if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) enforce = ENFORCE_ABOVE; } dsl_dataset_rele(ds, FTAG); return (enforce); } /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. * The prop argument indicates which limit to check. * * Note that all filesystem limits up to the root (or the highest * initialized) filesystem or the given ancestor must be satisfied. */ int dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, dsl_dir_t *ancestor, cred_t *cr, proc_t *proc) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; const char *count_prop; enforce_res_t enforce; int err = 0; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative * user in the global zone (i.e. a recursive snapshot by root). * However, we must handle the case of delegated permissions where we * are allowed to change the limit on the current dataset, but there * is another limit in the tree above. */ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc); if (enforce == ENFORCE_NEVER) return (0); /* * e.g. if renaming a dataset with no snapshots, count adjustment * is 0. */ if (delta == 0) return (0); if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { /* * We don't enforce the limit for temporary snapshots. This is * indicated by a NULL cred_t argument. */ if (cr == NULL) return (0); count_prop = DD_FIELD_SNAPSHOT_COUNT; } else { count_prop = DD_FIELD_FILESYSTEM_COUNT; } /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. 
We need this during rename so that we don't overcount * the check once we recurse up to the common ancestor. */ if (ancestor == dd) return (0); /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ if (!dsl_dir_is_zapified(dd)) return (0); err = zap_lookup(os, dd->dd_object, count_prop, sizeof (count), 1, &count); if (err == ENOENT) return (0); if (err != 0) return (err); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); if (err != 0) return (err); /* Is there a limit which we've hit? */ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) return (SET_ERROR(EDQUOT)); if (dd->dd_parent != NULL) err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, ancestor, cr, proc); return (err); } /* * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all * parents. When a new filesystem/snapshot is created, increment the count on * all parents, and when a filesystem/snapshot is destroyed, decrement the * count. */ void dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, dmu_tx_t *tx) { int err; objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t count; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); /* * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$' && strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) { return; } /* * e.g. if renaming a dataset with no snapshots, count adjustment is 0 */ if (delta == 0) return; /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know the counts are not valid on this node and we * know we shouldn't touch this node's counts. An uninitialized count * on the node indicates that either the feature has not yet been * activated or there are no limits on this part of the tree. */ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, prop, sizeof (count), 1, &count)) == ENOENT) return; VERIFY0(err); count += delta; /* Use a signed verify to make sure we're not neg. 
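 *
 * Illustrative aside, not part of this change: a worked example of the count
 * adjustment described above.  Suppose a snapshot_limit was set on "tank/a",
 * so "tank/a" and everything below it carry initialized counts while "tank"
 * above it does not.  Taking one snapshot of "tank/a/b" reaches this
 * function with delta = 1 and prop = DD_FIELD_SNAPSHOT_COUNT:
 *
 *	tank/a/b	snapshot_count += 1	(initialized, updated)
 *	tank/a		snapshot_count += 1	(initialized, updated)
 *	tank		untouched		(no count property, walk stops)
 *
 * Destroying that snapshot later repeats the same walk with delta = -1.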
*/ VERIFY3S(count, >=, 0); VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, tx)); /* Roll up this additional count into our ancestors */ if (dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) { ddphys->dd_parent_obj = pds->dd_object; /* update the filesystem counts */ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } uint64_t dsl_dir_get_used(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_bytes); } uint64_t dsl_dir_get_compressed(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_compressed_bytes); } uint64_t dsl_dir_get_quota(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_quota); } uint64_t dsl_dir_get_reservation(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_reserved); } uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd) { /* a fixed point number, 100x the ratio */ return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 
100 : (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / dsl_dir_phys(dd)->dd_compressed_bytes)); } uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_uncompressed_bytes); } uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); } uint64_t dsl_dir_get_usedds(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); } uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); } uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } void dsl_dir_get_origin(dsl_dir_t *dd, char *buf) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dsl_dir_get_quota(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dsl_dir_get_reservation(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, dsl_dir_get_logicalused(dd)); if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dsl_dir_get_usedsnap(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dsl_dir_get_usedds(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dsl_dir_get_usedrefreserv(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dsl_dir_get_usedchild(dd)); } mutex_exit(&dd->dd_lock); uint64_t count; if (dsl_dir_get_filesystem_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, count); } if (dsl_dir_get_snapshot_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } if (dsl_dir_is_clone(dd)) { char buf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dir_get_origin(dd, buf); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dsl_dir_phys(dd)); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); uint64_t new_accounted = MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg, (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static 
uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (int i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i & TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); } return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dsl_dir_phys(dd)->dd_quota != 0) quota = dsl_dir_phys(dd)->dd_quota; used = dsl_dir_phys(dd)->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL); quota = MIN(quota, poolsize); } if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg; uint64_t quota; struct tempreserve *tr; int retval; uint64_t ref_rsrv; top_of_function: txg = tx->tx_txg; retval = EDQUOT; ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ uint64_t est_inflight = dsl_dir_space_towrite(dd); for (int i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, !netfree, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error != 0) { mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. 
In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. */ if (dd->dd_parent == NULL) { uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, (netfree) ? ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); if (avail < quota) { quota = avail; retval = SET_ERROR(ENOSPC); } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes * or deferred frees (which may free up space for us). */ if (used_on_disk + est_inflight >= quota) { if (est_inflight > 0 || used_on_disk < quota) { retval = SET_ERROR(ERESTART); } else { ASSERT3U(used_on_disk, >=, quota); if (retval == ENOSPC && (used_on_disk - quota) < dsl_pool_deferred_space(dd->dd_pool)) { retval = SET_ERROR(ERESTART); } } dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", (u_longlong_t)used_on_disk>>10, (u_longlong_t)est_inflight>>10, (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (retval); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txg & TXG_MASK] += asize; uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent != NULL && parent_rsrv != 0) { /* * Recurse on our parent without recursion. This has been * observed to be potentially large stack usage even within * the test suite. Largest seen stack was 7632 bytes on linux. */ dd = dd->dd_parent; asize = parent_rsrv; ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); first = B_FALSE; goto top_of_function; } else { return (0); } } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); } else { if (err == EAGAIN) { /* * If arc_memory_throttle() detected that pageout * is running and we are low on memory, we delay new * non-pageout transactions to give pageout an * advantage. * * It is unfortunate to be delaying while the caller's * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } if (err == 0) { err = dsl_dir_tempreserve_impl(dd, asize, netfree, B_FALSE, tr_list, tx, B_TRUE); } if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). 
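 *
 * A caller in open context is expected to pair the two calls, roughly
 * as follows (cookie is just a local here):
 *
 *	void *cookie;
 *	if (dsl_dir_tempreserve_space(dd, lsize, asize, netfree,
 *	    &cookie, tx) == 0) {
 *		... dirty the data for this txg ...
 *		dsl_dir_tempreserve_clear(cookie, tx);
 *	}
 *
 * The lsize passed to dsl_dir_tempreserve_space() is the logical size,
 * which is throttled against the ARC, while asize is the worst-case
 * allocated size charged against the dsl_dir quotas.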
*/ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while ((tr = list_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } /* * This should be called from open context when we think we're going to write * or free space, for example when dirtying data. Be conservative; it's okay * to write less space or free more, but we don't want to write more or free * less than the amount specified. * * NOTE: The behavior of this function is identical to the Illumos / FreeBSD * version however it has been adjusted to use an iterative rather than * recursive algorithm to minimize stack usage. */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; do { mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); dd = dd->dd_parent; space = parent_space; } while (space && dd); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update * ds->ds_reserved and the dsl_dir accounting, so that * dsl_dataset_check_quota() can see dataset and dir accounting * consistently. 
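 * Hence dd_lock is only acquired below when the caller does not
 * already hold it.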
*/ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); if (needlock) mutex_enter(&dd->dd_lock); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || ddp->dd_uncompressed_bytes >= -uncompressed); ddp->dd_used_bytes += used; ddp->dd_uncompressed_bytes += uncompressed; ddp->dd_compressed_bytes += compressed; if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used); ddp->dd_used_breakdown[type] += used; #ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += ddp->dd_used_breakdown[t]; ASSERT3U(u, ==, ddp->dd_used_bytes); } #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_transfer_space(dd->dd_parent, accounted_delta, compressed, uncompressed, used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); if (delta == 0 || !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? ddp->dd_used_breakdown[oldtype] >= delta : ddp->dd_used_breakdown[newtype] >= -delta); ASSERT(ddp->dd_used_bytes >= ABS(delta)); ddp->dd_used_breakdown[oldtype] -= delta; ddp->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } void dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, int64_t compressed, int64_t uncompressed, int64_t tonew, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { int64_t accounted_delta; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || ddp->dd_uncompressed_bytes >= -uncompressed); ddp->dd_used_bytes += used; ddp->dd_uncompressed_bytes += uncompressed; ddp->dd_compressed_bytes += compressed; if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(tonew - used <= 0 || ddp->dd_used_breakdown[oldtype] >= tonew - used); ASSERT(tonew >= 0 || ddp->dd_used_breakdown[newtype] >= -tonew); ddp->dd_used_breakdown[oldtype] -= tonew - used; ddp->dd_used_breakdown[newtype] += tonew; #ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += ddp->dd_used_breakdown[t]; ASSERT3U(u, ==, ddp->dd_used_bytes); } #endif } mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_transfer_space(dd->dd_parent, accounted_delta, compressed, uncompressed, used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dir_set_qr_arg_t; static int dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t towrite, newval; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); error = dsl_prop_predict(ds->ds_dir, "quota", ddsqra->ddsqra_source, 
ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. */ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); dsl_dir_phys(ds->ds_dir)->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t newval, used, avail; int error; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. 
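 * In that case we skip the check entirely; the syncing-context
 * execution of this function provides the definitive answer.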
*/ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { uint64_t delta = MAX(used, newval) - MAX(used, dsl_dir_phys(dd)->dd_reserved); if (delta > avail || (dsl_dir_phys(dd)->dd_quota > 0 && newval > dsl_dir_phys(dd)->dd_quota)) error = SET_ERROR(ENOSPC); } dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_RESERVATION), (longlong_t)newval); } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. 
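 * A reservation along the path can absorb part of the delta: for
 * example, a dir with 4G used and a 3G reservation that frees 2G only
 * propagates a 1G decrease to its parent, because the reservation
 * keeps at least 3G charged there either way (see parent_delta()).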
*/ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; cred_t *ddra_cred; proc_t *ddra_proc; } dsl_dir_rename_arg_t; typedef struct dsl_valid_rename_arg { int char_delta; int nest_delta; } dsl_valid_rename_arg_t; static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) dp; dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, namebuf); ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); int namelen = strlen(namebuf) + dvra->char_delta; int depth = get_dataset_depth(namebuf) + dvra->nest_delta; if (namelen >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int dsl_dir_rename_check(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; dsl_valid_rename_arg_t dvra; dsl_dataset_t *parentds; objset_t *parentos; const char *mynewname; int error; /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); if (error != 0) return (error); /* new parent should exist */ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname); if (error != 0) { dsl_dir_rele(dd, FTAG); return (error); } /* can't rename to different pool */ if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EXDEV)); } /* new name should not already exist */ if (mynewname == NULL) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EEXIST)); } /* can't rename below anything but filesystems (eg. no ZVOLs) */ error = dsl_dataset_hold_obj(newparent->dd_pool, dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } error = dmu_objset_from_ds(parentds, &parentos); if (error != 0) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } if (dmu_objset_type(parentos) != DMU_OST_ZFS) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } dsl_dataset_rele(parentds, FTAG); ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); dvra.char_delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) - get_dataset_depth(ddra->ddra_oldname); /* if the name length is growing, validate child name lengths */ if (dvra.char_delta > 0 || dvra.nest_delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } if (dmu_tx_is_syncing(tx)) { if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Although this is the check function and we don't * normally make on-disk changes in check functions, * we need to do that here. 
* * Ensure this portion of the tree's counts have been * initialized in case the new parent has limits set. */ dsl_dir_init_fs_ss_count(dd, tx); } } if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_reserved); objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; if (dsl_dir_is_zapified(dd)) { int err; err = zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } /* * have to add 1 for the filesystem itself that we're * moving */ fs_cnt++; err = zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } } /* check for encryption errors */ error = dsl_dir_rename_crypt_check(dd, newparent); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EACCES)); } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dir_transfer_possible(dd->dd_parent, newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred, ddra->ddra_proc); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (0); } static void dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; /* * We already made sure the dd counts were initialized in the * check function. 
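 * That is why the lookups below can simply VERIFY success.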
*/ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt)); /* add 1 for the filesystem itself that we're moving */ fs_cnt++; VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt)); } dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(newparent, fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_fs_ss_count_adjust(newparent, ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dsl_dir_phys(dd)->dd_used_bytes, -dsl_dir_phys(dd)->dd_compressed_bytes, -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_compressed_bytes, dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); if (dsl_dir_phys(dd)->dd_reserved > dsl_dir_phys(dd)->dd_used_bytes) { uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx)); (void) strlcpy(dd->dd_myname, mynewname, sizeof (dd->dd_myname)); dsl_dir_rele(dd->dd_parent, dd); dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); /* TODO: A rename callback to avoid these layering violations. 
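 * (zfsvfs_update_fromname() and zvol_rename_minors() below reach from
 * the DSL layer up into the mounted-filesystem and zvol device naming,
 * which is the violation being referred to.)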
*/ zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname, B_TRUE); dsl_prop_notify_all(dd); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); } int dsl_dir_rename(const char *oldname, const char *newname) { dsl_dir_rename_arg_t ddra; ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; ddra.ddra_cred = CRED(); ddra.ddra_proc = curproc; return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr, proc_t *proc) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (SET_ERROR(ENOSPC)); err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, ancestor, cr, proc); if (err != 0) return (err); err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, ancestor, cr, proc); if (err != 0) return (err); return (0); } inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { inode_timespec_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); inode_timespec_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t ddobj = dd->dd_object; dsl_dir_zapify(dd, tx); VERIFY0(zap_update(mos, ddobj, DD_FIELD_SNAPSHOTS_CHANGED, sizeof (uint64_t), sizeof (inode_timespec_t) / sizeof (uint64_t), &t, tx)); } mutex_exit(&dd->dd_lock); } void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } boolean_t dsl_dir_is_zapified(dsl_dir_t *dd) { dmu_object_info_t doi; dmu_object_info_from_db(dd->dd_dbuf, &doi); return (doi.doi_type == DMU_OTN_ZAP_METADATA); } void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) { objset_t *mos = dd->dd_pool->dp_meta_objset; ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, SPA_FEATURE_LIVELIST)); dsl_deadlist_open(&dd->dd_livelist, mos, obj); bplist_create(&dd->dd_pending_allocs); bplist_create(&dd->dd_pending_frees); } void dsl_dir_livelist_close(dsl_dir_t *dd) { dsl_deadlist_close(&dd->dd_livelist); bplist_destroy(&dd->dd_pending_allocs); bplist_destroy(&dd->dd_pending_frees); } void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) { uint64_t obj; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; livelist_condense_entry_t to_condense = spa->spa_to_condense; if (!dsl_deadlist_is_open(&dd->dd_livelist)) return; /* * If the livelist being removed is set to be condensed, stop the * condense zthr and indicate the cancellation in the spa_to_condense * struct in case the condense no-wait synctask has already started */ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL && (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { /* * We use zthr_wait_cycle_done instead of zthr_cancel * because we don't want to destroy the zthr, just have * it skip its current task. 
*/ spa->spa_to_condense.cancelled = B_TRUE; zthr_wait_cycle_done(ll_condense_thread); /* * If we've returned from zthr_wait_cycle_done without * clearing the to_condense data structure it's either * because the no-wait synctask has started (which is * indicated by 'syncing' field of to_condense) and we * can expect it to clear to_condense on its own. * Otherwise, we returned before the zthr ran. The * checkfunc will now fail as cancelled == B_TRUE so we * can safely NULL out ds, allowing a different dir's * livelist to be condensed. * * We can be sure that the to_condense struct will not * be repopulated at this stage because both this * function and dsl_livelist_try_condense execute in * syncing context. */ if ((spa->spa_to_condense.ds != NULL) && !spa->spa_to_condense.syncing) { dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; } } dsl_dir_livelist_close(dd); VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj)); VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, tx)); if (total) { dsl_deadlist_free(dp->dp_meta_objset, obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); } } static int dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&dd->dd_activity_lock)); switch (activity) { case ZFS_WAIT_DELETEQ: { #ifdef _KERNEL objset_t *os; error = dmu_objset_from_ds(ds, &os); if (error != 0) break; mutex_enter(&os->os_user_ptr_lock); void *user = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (dmu_objset_type(os) != DMU_OST_ZFS || user == NULL || zfs_get_vfs_flag_unmounted(os)) { *in_progress = B_FALSE; return (0); } uint64_t readonly = B_FALSE; error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly, NULL); if (error != 0) break; if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) { *in_progress = B_FALSE; return (0); } uint64_t count, unlinked_obj; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &unlinked_obj); if (error != 0) { dsl_dataset_rele(ds, FTAG); break; } error = zap_count(os, unlinked_obj, &count); if (error == 0) *in_progress = (count != 0); break; #else /* * The delete queue is ZPL specific, and libzpool doesn't have * it. It doesn't make sense to wait for it. 
*/ (void) ds; *in_progress = B_FALSE; break; #endif } default: panic("unrecognized value for activity %d", activity); } return (error); } int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *waited) { int error = 0; boolean_t in_progress; dsl_pool_t *dp = dd->dd_pool; for (;;) { dsl_pool_config_enter(dp, FTAG); error = dsl_dir_activity_in_progress(dd, ds, activity, &in_progress); dsl_pool_config_exit(dp, FTAG); if (error != 0 || !in_progress) break; *waited = B_TRUE; if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) == 0 || dd->dd_activity_cancelled) { error = SET_ERROR(EINTR); break; } } return (error); } void dsl_dir_cancel_waiters(dsl_dir_t *dd) { mutex_enter(&dd->dd_activity_lock); dd->dd_activity_cancelled = B_TRUE; cv_broadcast(&dd->dd_activity_cv); while (dd->dd_activity_waiters > 0) cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock); mutex_exit(&dd->dd_activity_lock); } #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 1d3d26124949..610e887b3fba 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -1,1287 +1,1287 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright 2019 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #define ZPROP_INHERIT_SUFFIX "$inherit" #define ZPROP_RECVD_SUFFIX "$recvd" static int dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) { /* * The setonce properties are read-only, BUT they still * have a default value that can be used as the initial * value. 
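 * (Examples are normalization and casesensitivity: they can only be
 * set at dataset creation time, yet still fall back to a default
 * here.)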
*/ if (prop == ZPROP_INVAL || (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) return (SET_ERROR(ENOENT)); if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { if (intsz != 1) return (SET_ERROR(EOVERFLOW)); - (void) strncpy(buf, zfs_prop_default_string(prop), + (void) strlcpy(buf, zfs_prop_default_string(prop), numints); } else { if (intsz != 8 || numints < 1) return (SET_ERROR(EOVERFLOW)); *(uint64_t *)buf = zfs_prop_default_numeric(prop); } return (0); } int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) { int err; dsl_dir_t *target = dd; objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; boolean_t inheritable; boolean_t inheriting = B_FALSE; char *inheritstr; char *recvdstr; ASSERT(dsl_pool_config_held(dd->dd_pool)); if (setpoint) setpoint[0] = '\0'; prop = zfs_name_to_prop(propname); inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop)); inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); /* * Note: dd may become NULL, therefore we shouldn't dereference it * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { if (dd != target || snapshot) { if (!inheritable) { err = SET_ERROR(ENOENT); break; } inheriting = B_TRUE; } /* Check for a local value. */ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, propname, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) dsl_dir_name(dd, setpoint); break; } /* * Skip the check for a received value if there is an explicit * inheritance entry. */ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, inheritstr); if (err != 0 && err != ENOENT) break; if (err == ENOENT) { /* Check for a received value. */ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, recvdstr, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) { if (inheriting) { dsl_dir_name(dd, setpoint); } else { (void) strlcpy(setpoint, ZPROP_SOURCE_VAL_RECVD, MAXNAMELEN); } } break; } } /* * If we found an explicit inheritance entry, err is zero even * though we haven't yet found the value, so reinitializing err * at the end of the loop (instead of at the beginning) ensures * that err has a valid post-loop value. */ err = SET_ERROR(ENOENT); } if (err == ENOENT) err = dodefault(prop, intsz, numints, buf); kmem_strfree(inheritstr); kmem_strfree(recvdstr); return (err); } int dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, int intsz, int numints, void *buf, char *setpoint) { zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t inheritable; uint64_t zapobj; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop)); zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; ASSERT(ds->ds_is_snapshot); /* Check for a local value. */ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) dsl_dataset_name(ds, setpoint); return (err); } /* * Skip the check for a received value if there is an explicit * inheritance entry. */ if (inheritable) { char *inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); err = zap_contains(mos, zapobj, inheritstr); kmem_strfree(inheritstr); if (err != 0 && err != ENOENT) return (err); } if (err == ENOENT) { /* Check for a received value. 
*/ char *recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); err = zap_lookup(mos, zapobj, recvdstr, intsz, numints, buf); kmem_strfree(recvdstr); if (err != ENOENT) { if (setpoint != NULL && err == 0) (void) strlcpy(setpoint, ZPROP_SOURCE_VAL_RECVD, MAXNAMELEN); return (err); } } } return (dsl_prop_get_dd(ds->ds_dir, propname, intsz, numints, buf, setpoint, ds->ds_is_snapshot)); } static dsl_prop_record_t * dsl_prop_record_find(dsl_dir_t *dd, const char *propname) { dsl_prop_record_t *pr = NULL; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (pr = list_head(&dd->dd_props); pr != NULL; pr = list_next(&dd->dd_props, pr)) { if (strcmp(pr->pr_propname, propname) == 0) break; } return (pr); } static dsl_prop_record_t * dsl_prop_record_create(dsl_dir_t *dd, const char *propname) { dsl_prop_record_t *pr; ASSERT(MUTEX_HELD(&dd->dd_lock)); pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP); pr->pr_propname = spa_strdup(propname); list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_pr_node)); list_insert_head(&dd->dd_props, pr); return (pr); } void dsl_prop_init(dsl_dir_t *dd) { list_create(&dd->dd_props, sizeof (dsl_prop_record_t), offsetof(dsl_prop_record_t, pr_node)); } void dsl_prop_fini(dsl_dir_t *dd) { dsl_prop_record_t *pr; while ((pr = list_remove_head(&dd->dd_props)) != NULL) { list_destroy(&pr->pr_cbs); spa_strfree((char *)pr->pr_propname); kmem_free(pr, sizeof (dsl_prop_record_t)); } list_destroy(&dd->dd_props); } /* * Register interest in the named property. We'll call the callback * once to notify it of the current property value, and again each time * the property changes, until this callback is unregistered. * * Return 0 on success, errno if the prop is not an integer value. */ int dsl_prop_register(dsl_dataset_t *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg) { dsl_dir_t *dd = ds->ds_dir; uint64_t value; dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; int err; dsl_pool_t *dp __maybe_unused = dd->dd_pool; ASSERT(dsl_pool_config_held(dp)); err = dsl_prop_get_int_ds(ds, propname, &value); if (err != 0) return (err); cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); cbr->cbr_ds = ds; cbr->cbr_func = callback; cbr->cbr_arg = cbarg; mutex_enter(&dd->dd_lock); pr = dsl_prop_record_find(dd, propname); if (pr == NULL) pr = dsl_prop_record_create(dd, propname); cbr->cbr_pr = pr; list_insert_head(&pr->pr_cbs, cbr); list_insert_head(&ds->ds_prop_cbs, cbr); mutex_exit(&dd->dd_lock); cbr->cbr_func(cbr->cbr_arg, value); return (0); } int dsl_prop_get(const char *dsname, const char *propname, int intsz, int numints, void *buf, char *setpoint) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = dsl_prop_get_ds(dmu_objset_ds(os), propname, intsz, numints, buf, setpoint); dmu_objset_rele(os, FTAG); return (error); } /* * Get the current property value. It may have changed by the time this * function returns, so it is NOT safe to follow up with * dsl_prop_register() and assume that the value has not changed in * between. * * Return 0 on success, ENOENT if ddname is invalid. 
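 * The value is resolved like any other lookup: a local setting wins,
 * then a received ($recvd) value unless masked by an explicit $inherit
 * entry; failing that the search repeats at each ancestor, and finally
 * the property's default is used (see dsl_prop_get_dd()).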
*/ int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint) { return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } int dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, uint64_t *valuep) { return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); } /* * Predict the effective value of the given special property if it were set with * the given value and source. This is not a general purpose function. It exists * only to handle the special requirements of the quota and reservation * properties. The fact that these properties are non-inheritable greatly * simplifies the prediction logic. * * Returns 0 on success, a positive error code on failure, or -1 if called with * a property not handled by this function. */ int dsl_prop_predict(dsl_dir_t *dd, const char *propname, zprop_source_t source, uint64_t value, uint64_t *newvalp) { zfs_prop_t prop = zfs_name_to_prop(propname); objset_t *mos; uint64_t zapobj; uint64_t version; char *recvdstr; int err = 0; switch (prop) { case ZFS_PROP_QUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFQUOTA: case ZFS_PROP_REFRESERVATION: break; default: return (-1); } mos = dd->dd_pool->dp_meta_objset; zapobj = dsl_dir_phys(dd)->dd_props_zapobj; recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); version = spa_version(dd->dd_pool->dp_spa); if (version < SPA_VERSION_RECVD_PROPS) { if (source & ZPROP_SRC_NONE) source = ZPROP_SRC_NONE; else if (source & ZPROP_SRC_RECEIVED) source = ZPROP_SRC_LOCAL; } switch ((int)source) { case ZPROP_SRC_NONE: /* Revert to the received value, if any. */ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); if (err == ENOENT) *newvalp = 0; break; case ZPROP_SRC_LOCAL: *newvalp = value; break; case ZPROP_SRC_RECEIVED: /* * If there's no local setting, then the new received value will * be the effective value. */ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) *newvalp = value; break; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * We're clearing the received value, so the local setting (if * it exists) remains the effective value. */ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) *newvalp = 0; break; default: panic("unexpected property source: %d", source); } kmem_strfree(recvdstr); if (err == ENOENT) return (0); return (err); } /* * Unregister this callback. Return 0 on success, ENOENT if ddname is * invalid, or ENOMSG if no matching callback registered. * * NOTE: This function is no longer used internally but has been preserved * to prevent breaking external consumers (Lustre, etc). */ int dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg) { dsl_dir_t *dd = ds->ds_dir; dsl_prop_cb_record_t *cbr; mutex_enter(&dd->dd_lock); for (cbr = list_head(&ds->ds_prop_cbs); cbr; cbr = list_next(&ds->ds_prop_cbs, cbr)) { if (cbr->cbr_ds == ds && cbr->cbr_func == callback && cbr->cbr_arg == cbarg && strcmp(cbr->cbr_pr->pr_propname, propname) == 0) break; } if (cbr == NULL) { mutex_exit(&dd->dd_lock); return (SET_ERROR(ENOMSG)); } list_remove(&ds->ds_prop_cbs, cbr); list_remove(&cbr->cbr_pr->pr_cbs, cbr); mutex_exit(&dd->dd_lock); kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); return (0); } /* * Unregister all callbacks that are registered with the * given callback argument. 
*/ void dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg) { dsl_prop_cb_record_t *cbr, *next_cbr; dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); next_cbr = list_head(&ds->ds_prop_cbs); while (next_cbr != NULL) { cbr = next_cbr; next_cbr = list_next(&ds->ds_prop_cbs, cbr); if (cbr->cbr_arg == cbarg) { list_remove(&ds->ds_prop_cbs, cbr); list_remove(&cbr->cbr_pr->pr_cbs, cbr); kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); } } mutex_exit(&dd->dd_lock); } boolean_t dsl_prop_hascb(dsl_dataset_t *ds) { return (!list_is_empty(&ds->ds_prop_cbs)); } static int dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) arg; dsl_dir_t *dd = ds->ds_dir; dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; mutex_enter(&dd->dd_lock); for (pr = list_head(&dd->dd_props); pr; pr = list_next(&dd->dd_props, pr)) { for (cbr = list_head(&pr->pr_cbs); cbr; cbr = list_next(&pr->pr_cbs, cbr)) { uint64_t value; /* * Callback entries do not have holds on their * datasets so that datasets with registered * callbacks are still eligible for eviction. * Unlike operations to update properties on a * single dataset, we are performing a recursive * descent of related head datasets. The caller * of this function only has a dataset hold on * the passed in head dataset, not the snapshots * associated with this dataset. Without a hold, * the dataset pointer within callback records * for snapshots can be invalidated by eviction * at any time. * * Use dsl_dataset_try_add_ref() to verify * that the dataset for a snapshot has not * begun eviction processing and to prevent * eviction from occurring for the duration of * the callback. If the hold attempt fails, * this object is already being evicted and the * callback can be safely ignored. */ if (ds != cbr->cbr_ds && !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) continue; if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_pr->pr_propname, sizeof (value), 1, &value, NULL) == 0) cbr->cbr_func(cbr->cbr_arg, value); if (ds != cbr->cbr_ds) dsl_dataset_rele(cbr->cbr_ds, FTAG); } } mutex_exit(&dd->dd_lock); return (0); } /* * Update all property values for ddobj & its descendants. This is used * when renaming the dir. */ void dsl_prop_notify_all(dsl_dir_t *dd) { dsl_pool_t *dp = dd->dd_pool; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, NULL, DS_FIND_CHILDREN); } static void dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, const char *propname, uint64_t value, int first) { dsl_dir_t *dd; dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; objset_t *mos = dp->dp_meta_objset; zap_cursor_t zc; zap_attribute_t *za; int err; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err) return; if (!first) { /* * If the prop is set here, then this change is not * being inherited here or below; stop the recursion. */ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, propname); if (err == 0) { dsl_dir_rele(dd, FTAG); return; } ASSERT3U(err, ==, ENOENT); } mutex_enter(&dd->dd_lock); pr = dsl_prop_record_find(dd, propname); if (pr != NULL) { for (cbr = list_head(&pr->pr_cbs); cbr; cbr = list_next(&pr->pr_cbs, cbr)) { uint64_t propobj; /* * cbr->cbr_ds may be invalidated due to eviction, * requiring the use of dsl_dataset_try_add_ref(). * See comment block in dsl_prop_notify_all_cb() * for details. 
*/ if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) continue; propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; /* * If the property is not set on this ds, then it is * inherited here; call the callback. */ if (propobj == 0 || zap_contains(mos, propobj, propname) != 0) cbr->cbr_func(cbr->cbr_arg, value); dsl_dataset_rele(cbr->cbr_ds, FTAG); } } mutex_exit(&dd->dd_lock); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, mos, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { dsl_prop_changed_notify(dp, za->za_first_integer, propname, value, FALSE); } kmem_free(za, sizeof (zap_attribute_t)); zap_cursor_fini(&zc); dsl_dir_rele(dd, FTAG); } void dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, zprop_source_t source, int intsz, int numints, const void *value, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t zapobj, intval, dummy, count; int isint; char valbuf[32]; const char *valstr = NULL; char *inheritstr; char *recvdstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0); if (ds->ds_is_snapshot) { ASSERT(version >= SPA_VERSION_SNAP_PROPS); if (dsl_dataset_phys(ds)->ds_props_obj == 0 && (source & ZPROP_SRC_NONE) == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_props_obj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); } zapobj = dsl_dataset_phys(ds)->ds_props_obj; } else { zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; } /* If we are removing objects from a non-existent ZAP just return */ if (zapobj == 0) return; if (version < SPA_VERSION_RECVD_PROPS) { if (source & ZPROP_SRC_NONE) source = ZPROP_SRC_NONE; else if (source & ZPROP_SRC_RECEIVED) source = ZPROP_SRC_LOCAL; } inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); switch ((int)source) { case ZPROP_SRC_NONE: /* * revert to received value, if any (inherit -S) * - remove propname * - remove propname$inherit */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); break; case ZPROP_SRC_LOCAL: /* * remove propname$inherit * set propname -> value */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, value, tx)); break; case ZPROP_SRC_INHERITED: /* * explicitly inherit * - remove propname * - set propname$inherit */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; VERIFY0(zap_update(mos, zapobj, inheritstr, 8, 1, &dummy, tx)); } break; case ZPROP_SRC_RECEIVED: /* * set propname$recvd -> value */ err = zap_update(mos, zapobj, recvdstr, intsz, numints, value, tx); ASSERT(err == 0); break; case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): /* * clear local and received settings * - remove propname * - remove propname$inherit * - remove propname$recvd */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); zfs_fallthrough; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * remove propname$recvd */ err = zap_remove(mos, zapobj, recvdstr, tx); ASSERT(err 
== 0 || err == ENOENT); break; default: cmn_err(CE_PANIC, "unexpected property source: %d", source); } kmem_strfree(inheritstr); kmem_strfree(recvdstr); /* * If we are left with an empty snap zap we can destroy it. * This will prevent unnecessary calls to zap_lookup() in * the "zfs list" and "zfs get" code paths. */ if (ds->ds_is_snapshot && zap_count(mos, zapobj, &count) == 0 && count == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_props_obj = 0; zap_destroy(mos, zapobj, tx); } if (isint) { VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); if (ds->ds_is_snapshot) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this * property, so just look for callbacks on this * ds here. */ mutex_enter(&ds->ds_dir->dd_lock); for (cbr = list_head(&ds->ds_prop_cbs); cbr; cbr = list_next(&ds->ds_prop_cbs, cbr)) { if (strcmp(cbr->cbr_pr->pr_propname, propname) == 0) cbr->cbr_func(cbr->cbr_arg, intval); } mutex_exit(&ds->ds_dir->dd_lock); } else { dsl_prop_changed_notify(ds->ds_dir->dd_pool, ds->ds_dir->dd_object, propname, intval, TRUE); } (void) snprintf(valbuf, sizeof (valbuf), "%lld", (longlong_t)intval); valstr = valbuf; } else { if (source == ZPROP_SRC_LOCAL) { valstr = value; } else { tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); if (dsl_prop_get_ds(ds, propname, 1, ZAP_MAXVALUELEN, tbuf, NULL) == 0) valstr = tbuf; } } spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, "%s=%s", propname, (valstr == NULL ? "" : valstr)); if (tbuf != NULL) kmem_free(tbuf, ZAP_MAXVALUELEN); } int dsl_prop_set_int(const char *dsname, const char *propname, zprop_source_t source, uint64_t value) { nvlist_t *nvl = fnvlist_alloc(); int error; fnvlist_add_uint64(nvl, propname, value); error = dsl_props_set(dsname, source, nvl); fnvlist_free(nvl); return (error); } int dsl_prop_set_string(const char *dsname, const char *propname, zprop_source_t source, const char *value) { nvlist_t *nvl = fnvlist_alloc(); int error; fnvlist_add_string(nvl, propname, value); error = dsl_props_set(dsname, source, nvl); fnvlist_free(nvl); return (error); } int dsl_prop_inherit(const char *dsname, const char *propname, zprop_source_t source) { nvlist_t *nvl = fnvlist_alloc(); int error; fnvlist_add_boolean(nvl, propname); error = dsl_props_set(dsname, source, nvl); fnvlist_free(nvl); return (error); } int dsl_props_set_check(void *arg, dmu_tx_t *tx) { dsl_props_set_arg_t *dpsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t version; nvpair_t *elem = NULL; int err; err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); if (err != 0) return (err); version = spa_version(ds->ds_dir->dd_pool->dp_spa); while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if (nvpair_type(elem) == DATA_TYPE_STRING) { char *valstr = fnvpair_value_string(elem); if (strlen(valstr) >= (version < SPA_VERSION_STMF_PROP ? 
ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(E2BIG)); } } } if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dsl_dataset_rele(ds, FTAG); return (0); } void dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, nvlist_t *props, dmu_tx_t *tx) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { nvpair_t *pair = elem; const char *name = nvpair_name(pair); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * This usually happens when we reuse the nvlist_t data * returned by the counterpart dsl_prop_get_all_impl(). * For instance we do this to restore the original * received properties when an error occurs in the * zfs_ioc_recv() codepath. */ nvlist_t *attrs = fnvpair_value_nvlist(pair); pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(pair) == DATA_TYPE_STRING) { const char *value = fnvpair_value_string(pair); dsl_prop_set_sync_impl(ds, name, source, 1, strlen(value) + 1, value, tx); } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { uint64_t intval = fnvpair_value_uint64(pair); dsl_prop_set_sync_impl(ds, name, source, sizeof (intval), 1, &intval, tx); } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { dsl_prop_set_sync_impl(ds, name, source, 0, 0, NULL, tx); } else { panic("invalid nvpair type"); } } } void dsl_props_set_sync(void *arg, dmu_tx_t *tx) { dsl_props_set_arg_t *dpsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); dsl_dataset_rele(ds, FTAG); } /* * All-or-nothing; if any prop can't be set, nothing will be modified. */ int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) { dsl_props_set_arg_t dpsa; int nblks = 0; dpsa.dpsa_dsname = dsname; dpsa.dpsa_source = source; dpsa.dpsa_props = props; /* * If the source includes NONE, then we will only be removing entries * from the ZAP object. In that case don't check for ENOSPC. */ if ((source & ZPROP_SRC_NONE) == 0) nblks = 2 * fnvlist_num_pairs(props); return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED)); } typedef enum dsl_prop_getflags { DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ DSL_PROP_GET_LOCAL = 0x4, /* local properties */ DSL_PROP_GET_RECEIVED = 0x8, /* received properties */ } dsl_prop_getflags_t; static int dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) { zap_cursor_t zc; zap_attribute_t za; int err = 0; for (zap_cursor_init(&zc, mos, propobj); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { nvlist_t *propval; zfs_prop_t prop; char buf[ZAP_MAXNAMELEN]; char *valstr; const char *suffix; const char *propname; const char *source; suffix = strchr(za.za_name, '$'); if (suffix == NULL) { /* * Skip local properties if we only want received * properties. */ if (flags & DSL_PROP_GET_RECEIVED) continue; propname = za.za_name; source = setpoint; } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { /* Skip explicitly inherited entries. 
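 * They only record that a received value has been explicitly
 * inherited over, and carry no meaningful value of their own.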
*/ continue; } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { if (flags & DSL_PROP_GET_LOCAL) continue; - (void) strncpy(buf, za.za_name, (suffix - za.za_name)); - buf[suffix - za.za_name] = '\0'; + (void) strlcpy(buf, za.za_name, + MIN(sizeof (buf), suffix - za.za_name + 1)); propname = buf; if (!(flags & DSL_PROP_GET_RECEIVED)) { /* Skip if locally overridden. */ err = zap_contains(mos, propobj, propname); if (err == 0) continue; if (err != ENOENT) break; /* Skip if explicitly inherited. */ valstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); err = zap_contains(mos, propobj, valstr); kmem_strfree(valstr); if (err == 0) continue; if (err != ENOENT) break; } source = ((flags & DSL_PROP_GET_INHERITING) ? setpoint : ZPROP_SOURCE_VAL_RECVD); } else { /* * For backward compatibility, skip suffixes we don't * recognize. */ continue; } prop = zfs_name_to_prop(propname); /* Skip non-inheritable properties. */ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) continue; /* Skip properties not valid for this type. */ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_USERPROP && !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE)) continue; /* Skip properties already defined. */ if (nvlist_exists(nv, propname)) continue; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (za.za_integer_length == 1) { /* * String property */ char *tmp = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, propobj, za.za_name, 1, za.za_num_integers, tmp); if (err != 0) { kmem_free(tmp, za.za_num_integers); break; } VERIFY(nvlist_add_string(propval, ZPROP_VALUE, tmp) == 0); kmem_free(tmp, za.za_num_integers); } else { /* * Integer property */ ASSERT(za.za_integer_length == 8); (void) nvlist_add_uint64(propval, ZPROP_VALUE, za.za_first_integer); } VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } zap_cursor_fini(&zc); if (err == ENOENT) err = 0; return (err); } /* * Iterate over all properties for this dataset and return them in an nvlist. 
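 * The snapshot's own property ZAP (if any) is scanned first, followed
 * by each ancestor dsl_dir starting with the dataset's own; the
 * inherited-only filtering kicks in once we walk past the dataset's
 * own dir (or immediately, for a snapshot).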
*/ static int dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, dsl_prop_getflags_t flags) { dsl_dir_t *dd = ds->ds_dir; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err = 0; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (ds->ds_is_snapshot) flags |= DSL_PROP_GET_SNAPSHOT; ASSERT(dsl_pool_config_held(dp)); if (dsl_dataset_phys(ds)->ds_props_obj != 0) { ASSERT(flags & DSL_PROP_GET_SNAPSHOT); dsl_dataset_name(ds, setpoint); err = dsl_prop_get_all_impl(mos, dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); if (err) goto out; } for (; dd != NULL; dd = dd->dd_parent) { if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { if (flags & (DSL_PROP_GET_LOCAL | DSL_PROP_GET_RECEIVED)) break; flags |= DSL_PROP_GET_INHERITING; } dsl_dir_name(dd, setpoint); err = dsl_prop_get_all_impl(mos, dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); if (err) break; } out: if (err) { nvlist_free(*nvp); *nvp = NULL; } return (err); } boolean_t dsl_prop_get_hasrecvd(const char *dsname) { uint64_t dummy; return (0 == dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); } static int dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) { uint64_t version; spa_t *spa; int error = 0; VERIFY0(spa_open(dsname, &spa, FTAG)); version = spa_version(spa); spa_close(spa, FTAG); if (version >= SPA_VERSION_RECVD_PROPS) error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); return (error); } /* * Call after successfully receiving properties to ensure that only the first * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. */ int dsl_prop_set_hasrecvd(const char *dsname) { int error = 0; if (!dsl_prop_get_hasrecvd(dsname)) error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); return (error); } void dsl_prop_unset_hasrecvd(const char *dsname) { VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); } int dsl_prop_get_all(objset_t *os, nvlist_t **nvp) { return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); } int dsl_prop_get_received(const char *dsname, nvlist_t **nvp) { objset_t *os; int error; /* * Received properties are not distinguishable from local properties * until the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. */ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); dmu_objset_rele(os, FTAG); return (error); } void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) { nvlist_t *propval; const char *propname = zfs_prop_to_name(prop); uint64_t default_value; if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); return; } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); /* Indicate the default source if we can. 
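 * An empty ZPROP_SOURCE string is how callers are told the value is
 * simply the property's default.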
*/ if (dodefault(prop, 8, 1, &default_value) == 0 && value == default_value) { VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); } VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } void dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) { nvlist_t *propval; const char *propname = zfs_prop_to_name(prop); if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); return; } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } #if defined(_KERNEL) EXPORT_SYMBOL(dsl_prop_register); EXPORT_SYMBOL(dsl_prop_unregister); EXPORT_SYMBOL(dsl_prop_unregister_all); EXPORT_SYMBOL(dsl_prop_get); EXPORT_SYMBOL(dsl_prop_get_integer); EXPORT_SYMBOL(dsl_prop_get_all); EXPORT_SYMBOL(dsl_prop_get_received); EXPORT_SYMBOL(dsl_prop_get_ds); EXPORT_SYMBOL(dsl_prop_get_int_ds); EXPORT_SYMBOL(dsl_prop_get_dd); EXPORT_SYMBOL(dsl_props_set); EXPORT_SYMBOL(dsl_prop_set_int); EXPORT_SYMBOL(dsl_prop_set_string); EXPORT_SYMBOL(dsl_prop_inherit); EXPORT_SYMBOL(dsl_prop_predict); EXPORT_SYMBOL(dsl_prop_nvlist_add_uint64); EXPORT_SYMBOL(dsl_prop_nvlist_add_string); #endif diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index decf4ddae6af..daab1d6fce71 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1,2959 +1,2959 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include #include #include #include /* * SPA locking * * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa zfs_refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. 
To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. 
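As a usage note for the rules above: a read-only inquiry takes a single config lock as reader, and SCL_VDEV is the lightest lock that pins the vdev tree (the same pattern bp_get_dsize() uses later in this file). The sketch below is in the kernel idiom, calls spa_config_enter()/spa_config_exit() as defined below, and relies on the in-tree headers; count_leaves() and spa_count_leaves_example() are hypothetical names, not functions in this file.

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>

/* Hypothetical read-only inquiry: count the leaf vdevs under the root. */
static uint64_t
count_leaves(vdev_t *vd)
{
	uint64_t n = 0;

	if (vd->vdev_children == 0)
		return (1);
	for (uint64_t c = 0; c < vd->vdev_children; c++)
		n += count_leaves(vd->vdev_child[c]);
	return (n);
}

static uint64_t
spa_count_leaves_example(spa_t *spa)
{
	uint64_t n;

	/* SCL_VDEV as reader is sufficient to keep the tree from changing. */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	n = count_leaves(spa->spa_root_vdev);
	spa_config_exit(spa, SCL_VDEV, FTAG);
	return (n);
}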
*/ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* * Everything except dprintf, set_error, spa, and indirect_remap is on * by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ int zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ unsigned long zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ unsigned long zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. 
*/ unsigned long zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. */ int zfs_deadman_enabled = B_TRUE; /* * Controls the behavior of the deadman when it detects a "hung" I/O. * Valid values are zfs_deadman_failmode=. * * wait - Wait for the "hung" I/O (default) * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ const char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ int spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed (bounded by spa_max_slop). This ensures that we * don't run the pool completely out of space, due to unaccounted changes (e.g. * to the MOS). It also limits the worst-case time to allocate space. If we * have less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are * also part of this 3.2% of space which can't be consumed by normal writes; * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded * log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * Operations that are almost guaranteed to free up space in the absence of * a pool checkpoint can use up to three quarters of the slop space * (e.g zfs destroy). * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net * increase in the amount of space used, it is possible to run the pool * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. * * Further, on very large pools, the slop space will be smaller than * 3.2%, to avoid reserving much more space than we actually need; bounded * by spa_max_slop (128GB). * * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; static const int spa_allocators = 4; void spa_load_failed(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } void spa_load_note(spa_t *spa, const char *fmt, ...) 
{ va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } /* * By default dedup and user data indirects land in the special class */ static int zfs_ddt_data_is_special = B_TRUE; static int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ static int zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); scl->scl_writer = NULL; scl->scl_write_wanted = 0; scl->scl_count = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); ASSERT(scl->scl_count == 0); } } int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (scl->scl_count != 0) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } return (1); } void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { (void) tag; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || scl->scl_write_wanted) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (scl->scl_count != 0) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void spa_config_exit(spa_t *spa, int locks, const void *tag) { (void) tag; for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(scl->scl_count > 0); if (--scl->scl_count == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && scl->scl_count != 
0) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; /* Disable the deadman if the pool is suspended. */ if (spa_suspended(spa)) return; zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, (u_longlong_t)++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev, FTAG); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + MSEC_TO_TICK(zfs_deadman_checktime_ms)); } static int spa_log_sm_sort_by_txg(const void *va, const void *vb) { const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. 
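As a usage note for spa_lookup() above: it accepts a full dataset, snapshot, or bookmark name and truncates it at the first '/', '@', or '#', so callers may pass either a pool name or any name rooted in the pool. A short sketch, with a hypothetical pool name, under the required lock:

	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	spa = spa_lookup("tank/home@yesterday");	/* returns the spa_t for "tank" */
	mutex_exit(&spa_namespace_lock);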
*/ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) spa->spa_root = spa_strdup(altroot); spa->spa_alloc_count = spa_allocators; spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? 
NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } list_create(&spa->spa_leaf_list, sizeof (vdev_t), offsetof(vdev_t, vdev_leaf_node)); return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. */ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); while ((dp = list_head(&spa->spa_config_list)) != NULL) { list_remove(&spa->spa_config_list, dp); if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } for (int i = 0; i < spa->spa_alloc_count; i++) { avl_destroy(&spa->spa_allocs[i].spaa_tree); mutex_destroy(&spa->spa_allocs[i].spaa_lock); } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); zfs_refcount_destroy(&spa->spa_refcount); spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); cv_destroy(&spa->spa_activities_cv); cv_destroy(&spa->spa_waiters_cv); mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. 
*/ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, const void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. 
*/ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } static void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } static void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } static boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } static void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool its been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. 
*/ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); } /* * The same as spa_vdev_enter() above but additionally takes the guid of * the vdev being detached. When there is a rebuild in process it will be * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). * The rebuild is canceled if only a single child remains after the detach. */ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); vdev_autotrim_stop_all(spa); if (guid != 0) { vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd) { vdev_rebuild_stop_wait(vd->vdev_top); } } return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. 
*/ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, NULL); mutex_exit(&vd->vdev_initialize_lock); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); } /* * The vdev may be both a leaf and top-level device. */ vdev_autotrim_stop_wait(vd); spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_STATE_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_write_cachefile(spa, B_FALSE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read of the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. 
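A sketch of the documented spa_vdev_enter()/spa_vdev_exit() pairing for a configuration change, as defined above; do_vdev_change() and spa_vdev_change_example() are hypothetical stand-ins, not functions in this file.

/*
 * Hypothetical reconfiguration step; a real caller would add, remove,
 * attach, or detach vdevs here.
 */
static int
do_vdev_change(spa_t *spa)
{
	(void) spa;
	return (0);
}

static int
spa_vdev_change_example(spa_t *spa)
{
	uint64_t txg;
	int error;

	txg = spa_vdev_enter(spa);	/* namespace lock + SCL_ALL as writer */
	error = do_vdev_change(spa);

	/*
	 * Releases the locks, waits for txg to sync on success, and
	 * updates the config cache if the configuration changed.
	 */
	return (spa_vdev_exit(spa, NULL, txg, error));
}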
*/ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; vdev_t *vdev_top; if (vd == NULL || vd == spa->spa_root_vdev) { vdev_top = spa->spa_root_vdev; } else { vdev_top = vd->vdev_top; } if (vd != NULL || error == 0) vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) vdev_state_dirty(vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. 
*/ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); memcpy(new, s, len + 1); return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid; if (spa != NULL) { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)); } else { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(guid, 0)); } return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; const char *checksum = NULL; const char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. 
*/ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } void spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) { /* * We bump the feature refcount for each special vdev added to the pool */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } boolean_t spa_indirect_vdevs_loaded(spa_t *spa) { return (spa->spa_indirect_vdevs_loaded); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else - (void) strncpy(buf, spa->spa_root, buflen); + (void) strlcpy(buf, spa->spa_root, buflen); } int spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. */ uint64_t spa_final_dirty_txg(spa_t *spa) { return (spa->spa_final_txg - TXG_DEFER_SIZE); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* * Return the inflated asize for a logical write in bytes. This is used by the * DMU to calculate the space a logical write will require on disk. * If lsize is smaller than the largest physical block size allocatable on this * pool we use its value instead, since the write will end up using the whole * block anyway. 
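A short usage sketch for zfs_strtonum() (defined above): it parses bare lowercase hexadecimal with no "0x" prefix and stops at the first non-hex character, optionally reporting that position through nptr.

	char *end;
	uint64_t v = zfs_strtonum("beef,tail", &end);
	/* v == 0xbeef; end now points at ",tail" */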
*/ uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { if (lsize == 0) return (0); /* No inflation needed */ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to * the value of spa_max_slop. The embedded log space is not included in * spa_dspace. By subtracting it, the usable space (per "zfs list") is a * constant 97% of the total space, regardless of metaslab size (assuming the * default spa_slop_shift=5 and a non-tiny pool). * * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = 0; uint64_t slop = 0; /* * Make sure spa_dedup_dspace has been set. */ if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); /* * spa_get_dspace() includes the space only logically "used" by * deduplicated data, so since it's not useful to reserve more * space with more deduplicated data, we subtract that out here. */ space = spa_get_dspace(spa) - spa->spa_dedup_dspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* * Subtract the embedded log space, but no more than half the (3.2%) * unusable space. Note, the "no more than half" is only relevant if * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by * default. */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* * Slop space should be at least spa_min_slop, but no more than half * the entire pool. */ slop = MAX(slop, MIN(space >> 1, spa_min_slop)); return (slop); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } uint64_t spa_get_checkpoint_space(spa_t *spa) { return (spa->spa_checkpoint_info.sci_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that * contribute to dspace. If a file is overwritten, its old * blocks are freed and new blocks are allocated. If there are * no snapshots of the file, the available space should remain * the same. The old blocks could be freed from the * non-allocating vdev, but the new blocks must be allocated on * other (allocating) vdevs. By reserving the entire size of * the non-allocating vdevs (including allocated space), we * ensure that there will be enough space on the allocating * vdevs for this file overwrite to succeed. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace); spa->spa_dspace -= spa->spa_nonallocating_dspace; } } /* * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. 
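A worked example of the slop calculation in spa_get_slop_space() above, using hypothetical numbers and ignoring the embedded-log and dedup adjustments: a 10 TiB pool with the default spa_slop_shift = 5 would reserve 10 TiB / 32 = 320 GiB, which the spa_max_slop bound then clamps to 128 GiB.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t space = 10ULL << 40;		/* pool size: 10 TiB */
	uint64_t max_slop = 128ULL << 30;	/* spa_max_slop: 128 GiB */
	uint64_t min_slop = 128ULL << 20;	/* spa_min_slop: 128 MiB */
	int slop_shift = 5;			/* default spa_slop_shift */

	uint64_t slop = space >> slop_shift;	/* 320 GiB */
	if (slop > max_slop)			/* cap for very large pools */
		slop = max_slop;
	/*
	 * Floor for very small pools: at least spa_min_slop, but never
	 * more than half the pool.
	 */
	uint64_t floor = (space >> 1) < min_slop ? (space >> 1) : min_slop;
	if (slop < floor)
		slop = floor;

	(void) printf("%llu GiB\n", (unsigned long long)(slop >> 30));
	/* prints 128 */
	return (0);
}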
*/ uint64_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended != ZIO_SUSPEND_NONE); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } metaslab_class_t * spa_embedded_log_class(spa_t *spa) { return (spa->spa_embedded_log_class); } metaslab_class_t * spa_special_class(spa_t *spa) { return (spa->spa_special_class); } metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); } /* * Locate an appropriate allocation class */ metaslab_class_t * spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk) { /* * ZIL allocations determine their class in zio_alloc_zil(). */ ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; if (DMU_OT_IS_DDT(objtype)) { if (spa->spa_dedup_class->mc_groups != 0) return (spa_dedup_class(spa)); else if (has_special_class && zfs_ddt_data_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* Indirect blocks for user data can land in special if allowed */ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { if (has_special_class && zfs_user_indirect_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } if (DMU_OT_IS_METADATA(objtype) || level > 0) { if (has_special_class) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* * Allow small file blocks in special class in some cases (like * for the dRAID vdev feature). But always leave a reserve of * zfs_special_class_metadata_reserve_pct exclusively for metadata. */ if (DMU_OT_IS_FILE(objtype) && has_special_class && size <= special_smallblk) { metaslab_class_t *special = spa_special_class(spa); uint64_t alloc = metaslab_class_get_alloc(special); uint64_t space = metaslab_class_get_space(special); uint64_t limit = (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100; if (alloc < limit) return (special); } return (spa_normal_class(spa)); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. 
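A worked example of the small-block reserve in spa_preferred_class() above, with hypothetical sizes: given the default 25% metadata reserve, a 1 TiB special class keeps accepting small file blocks only while less than 768 GiB of it is allocated.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t space = 1ULL << 40;	/* special class capacity: 1 TiB */
	int reserve_pct = 25;		/* zfs_special_class_metadata_reserve_pct */
	uint64_t limit = (space * (100 - reserve_pct)) / 100;

	(void) printf("%llu GiB\n", (unsigned long long)(limit >> 30));
	/* prints 768; beyond this point the class is left to metadata */
	return (0);
}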
*/ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } spa_autotrim_t spa_get_autotrim(spa_t *spa) { return (spa->spa_autotrim); } uint64_t spa_deadman_ziotime(spa_t *spa) { return (spa->spa_deadman_ziotime); } uint64_t spa_get_deadman_failmode(spa_t *spa) { return (spa->spa_deadman_failmode); } void spa_set_deadman_failmode(spa_t *spa, const char *failmode) { if (strcmp(failmode, "wait") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; else if (strcmp(failmode, "continue") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; else if (strcmp(failmode, "panic") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; else spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; } void spa_set_deadman_ziotime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); } } void spa_set_deadman_synctime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); } } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd != NULL) dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } uint64_t spa_dirty_data(spa_t *spa) { return (spa->spa_dsl_pool->dp_dirty_total); } /* * ========================================================================== * SPA Import Progress Routines * ========================================================================== */ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; } spa_import_progress_t; spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", "pool_name"); return (0); } static int spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, (sip->pool_name ? 
sip->pool_name : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) { spa_import_progress_t *sip; while (shl->size > size) { sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list)); } static void spa_import_progress_init(void) { spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t), KM_SLEEP); spa_import_progress_list->size = 0; spa_import_progress_list->procfs_list.pl_private = spa_import_progress_list; procfs_list_install("zfs", NULL, "import_progress", 0644, &spa_import_progress_list->procfs_list, spa_import_progress_show, spa_import_progress_show_header, NULL, offsetof(spa_import_progress_t, smh_node)); } static void spa_import_progress_destroy(void) { spa_history_list_t *shl = spa_import_progress_list; procfs_list_uninstall(&shl->procfs_list); spa_import_progress_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); kmem_free(shl, sizeof (spa_history_list_t)); } int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t load_state) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_max_txg = load_max_txg; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->mmp_sec_remaining = mmp_sec_remaining; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * A new import is in progress, add an entry. 
*/ void spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &poolname); if (poolname == NULL) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); shl->size++; mutex_exit(&shl->procfs_list.pl_lock); } void spa_import_progress_remove(uint64_t pool_guid) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); break; } } mutex_exit(&shl->procfs_list.pl_lock); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (TREE_ISIGN(s)); } void spa_boot_init(void) { spa_config_load(); } void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifndef _KERNEL if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = arc_buf_sigsegv; if (sigaction(SIGSEGV, &sa, NULL) == -1) { perror("could not enable watchpoints: " "sigaction(SIGSEGV, ...) = "); } else { arc_watch = B_TRUE; } } #endif fm_init(); zfs_refcount_init(); unique_init(); zfs_btree_init(); metaslab_stat_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); vdev_prop_init(); l2arc_start(); scan_init(); qat_init(); spa_import_progress_init(); } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); scan_fini(); qat_fini(); spa_import_progress_destroy(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has a dedicated slog device. No locking needed. 
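spa_name_compare() above wraps strcmp() in TREE_ISIGN() because the AVL/btree code expects comparators to return exactly -1, 0, or +1 rather than an arbitrary signed value. A standalone equivalent is below; the ISIGN macro mirrors how TREE_ISIGN is typically defined, but it is written out here rather than taken verbatim from the headers.

#include <string.h>
#include <stdio.h>

/* Standalone stand-in for TREE_ISIGN(): collapse a comparison to -1/0/+1. */
#define	ISIGN(a)	(((a) > 0) - ((a) < 0))

static int
name_compare(const void *a1, const void *a2)
{
	return (ISIGN(strcmp((const char *)a1, (const char *)a2)));
}

int
main(void)
{
	printf("%d %d %d\n",
	    name_compare("pool-a", "pool-b"),	/* -1 */
	    name_compare("tank", "tank"),	/*  0 */
	    name_compare("zroot", "tank"));	/* +1 */
	return (0);
}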
* It's not a problem if the wrong answer is returned as it's only for * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } spa_mode_t spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; vdev_scan_stat_init(spa->spa_root_vdev); } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) return (SET_ERROR(ENOENT)); memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; /* data not stored on disk */ ps->pss_pass_exam = spa->spa_scan_pass_exam; ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; return (0); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg. */ uint64_t spa_get_last_removal_txg(spa_t *spa) { uint64_t vdevid; uint64_t ret = -1ULL; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* * sr_prev_indirect_vdev is only modified while holding all the * config locks, so it is sufficient to hold SCL_VDEV as reader when * examining it. 
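The only derived field in spa_scan_get_stats() above is the cumulative issued count: the in-memory counter for the current pass is added to the on-disk total carried over from earlier passes. A trivial standalone restatement, with made-up values, follows.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t issued_before_pass = 900ULL << 30;	/* prior passes, on disk */
	uint64_t pass_issued = 64ULL << 30;		/* current pass, in memory */

	/* pss_issued as reported to "zpool status"-style consumers. */
	uint64_t pss_issued = issued_before_pass + pass_issued;

	printf("%llu GiB issued so far\n",
	    (unsigned long long)(pss_issued >> 30));
	return (0);
}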
*/ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; while (vdevid != -1ULL) { vdev_t *vd = vdev_lookup_top(spa, vdevid); vdev_indirect_births_t *vib = vd->vdev_indirect_births; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); /* * If the removal did not remap any data, we don't care. */ if (vdev_indirect_births_count(vib) != 0) { ret = vdev_indirect_births_last_entry_txg(vib); break; } vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_VDEV, FTAG); IMPLY(ret != -1ULL, spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); return (ret); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } boolean_t spa_multihost(spa_t *spa) { return (spa->spa_multihost ? B_TRUE : B_FALSE); } uint32_t spa_get_hostid(spa_t *spa) { return (spa->spa_hostid); } boolean_t spa_trust_config(spa_t *spa) { return (spa->spa_trust_config); } uint64_t spa_missing_tvds_allowed(spa_t *spa) { return (spa->spa_missing_tvds_allowed); } space_map_t * spa_syncing_log_sm(spa_t *spa) { return (spa->spa_syncing_log_sm); } void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { spa->spa_missing_tvds = missing; } /* * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc). */ const char * spa_state_to_name(spa_t *spa) { ASSERT3P(spa, !=, NULL); /* * it is possible for the spa to exist, without root vdev * as the spa transitions during import/export */ vdev_t *rvd = spa->spa_root_vdev; if (rvd == NULL) { return ("TRANSITIONING"); } vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; if (spa_suspended(spa) && (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) return ("SUSPENDED"); switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return ("OFFLINE"); case VDEV_STATE_REMOVED: return ("REMOVED"); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return ("FAULTED"); else if (aux == VDEV_AUX_SPLIT_POOL) return ("SPLIT"); else return ("UNAVAIL"); case VDEV_STATE_FAULTED: return ("FAULTED"); case VDEV_STATE_DEGRADED: return ("DEGRADED"); case VDEV_STATE_HEALTHY: return ("ONLINE"); default: break; } return ("UNKNOWN"); } boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t spa_has_checkpoint(spa_t *spa) { return (spa->spa_checkpoint_txg != 0); } boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && spa->spa_mode == SPA_MODE_READ); } uint64_t spa_min_claim_txg(spa_t *spa) { uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; if (checkpoint_txg != 0) return (checkpoint_txg + 1); return (spa->spa_first_txg); } /* * If there is a checkpoint, async destroys may consume more space from * the pool instead of freeing it. In an attempt to save the pool from * getting suspended when it is about to run out of space, we stop * processing async destroys. */ boolean_t spa_suspend_async_destroy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t unreserved = dsl_pool_unreserved_space(dp, ZFS_SPACE_CHECK_EXTRA_RESERVED); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; uint64_t avail = (unreserved > used) ? 
(unreserved - used) : 0; if (spa_has_checkpoint(spa) && avail == 0) return (B_TRUE); return (B_FALSE); } #if defined(_KERNEL) int param_set_deadman_failmode_common(const char *val) { spa_t *spa = NULL; char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && strcmp(val, "panic")) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); } return (0); } #endif /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); EXPORT_SYMBOL(spa_close); EXPORT_SYMBOL(spa_refcount_zero); /* Pool configuration lock */ EXPORT_SYMBOL(spa_config_tryenter); EXPORT_SYMBOL(spa_config_enter); EXPORT_SYMBOL(spa_config_exit); EXPORT_SYMBOL(spa_config_held); /* Pool vdev add/remove lock */ EXPORT_SYMBOL(spa_vdev_enter); EXPORT_SYMBOL(spa_vdev_exit); /* Pool vdev state change lock */ EXPORT_SYMBOL(spa_vdev_state_enter); EXPORT_SYMBOL(spa_vdev_state_exit); /* Accessor functions */ EXPORT_SYMBOL(spa_shutting_down); EXPORT_SYMBOL(spa_get_dsl); EXPORT_SYMBOL(spa_get_rootblkptr); EXPORT_SYMBOL(spa_set_rootblkptr); EXPORT_SYMBOL(spa_altroot); EXPORT_SYMBOL(spa_sync_pass); EXPORT_SYMBOL(spa_name); EXPORT_SYMBOL(spa_guid); EXPORT_SYMBOL(spa_last_synced_txg); EXPORT_SYMBOL(spa_first_txg); EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); EXPORT_SYMBOL(spa_special_class); EXPORT_SYMBOL(spa_preferred_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); EXPORT_SYMBOL(spa_lookup_by_guid); EXPORT_SYMBOL(spa_has_spare); EXPORT_SYMBOL(dva_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize); EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, "Set additional debugging flags"); ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, "Set to attempt to recover from fatal errors"); ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, 
ULONG, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, "Place DDT data into the special class"); ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, param_set_deadman_synctime, param_get_ulong, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, param_set_deadman_ziotime, param_get_ulong, ZMOD_RW, "IO expiration time in milliseconds"); ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); /* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_int, ZMOD_RW, "Reserved free space in pool"); diff --git a/module/zfs/zcp_get.c b/module/zfs/zcp_get.c index 8230a4193662..cd17374eb422 100644 --- a/module/zfs/zcp_get.c +++ b/module/zfs/zcp_get.c @@ -1,810 +1,809 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2016 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif static int get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) { int error; objset_t *os; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); if (ds->ds_is_snapshot) { *type = ZFS_TYPE_SNAPSHOT; } else { switch (os->os_phys->os_type) { case DMU_OST_ZFS: *type = ZFS_TYPE_FILESYSTEM; break; case DMU_OST_ZVOL: *type = ZFS_TYPE_VOLUME; break; default: return (EINVAL); } } return (0); } /* * Returns the string name of ds's type in str (a buffer which should be * at least 12 bytes long). */ static int get_objset_type_name(dsl_dataset_t *ds, char *str) { zfs_type_t type = ZFS_TYPE_INVALID; int error = get_objset_type(ds, &type); if (error != 0) return (error); switch (type) { case ZFS_TYPE_SNAPSHOT: (void) strlcpy(str, "snapshot", ZAP_MAXVALUELEN); break; case ZFS_TYPE_FILESYSTEM: (void) strlcpy(str, "filesystem", ZAP_MAXVALUELEN); break; case ZFS_TYPE_VOLUME: (void) strlcpy(str, "volume", ZAP_MAXVALUELEN); break; default: return (EINVAL); } return (0); } /* * Determines the source of a property given its setpoint and * property type. It pushes the source to the lua stack. 
*/ static void get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop) { if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) { lua_pushnil(state); } else { const char *src; if (strcmp("", setpoint) == 0) { src = "default"; } else { src = setpoint; } (void) lua_pushstring(state, src); } } /* * Given an error encountered while getting properties, either longjmp's for * a fatal error or pushes nothing to the stack for a non fatal one. */ static int zcp_handle_error(lua_State *state, const char *dataset_name, const char *property_name, int error) { ASSERT3S(error, !=, 0); if (error == ENOENT) { return (0); } else if (error == EINVAL) { return (luaL_error(state, "property '%s' is not a valid property on dataset '%s'", property_name, dataset_name)); } else if (error == EIO) { return (luaL_error(state, "I/O error while retrieving property '%s' on dataset '%s'", property_name, dataset_name)); } else { return (luaL_error(state, "unexpected error %d while " "retrieving property '%s' on dataset '%s'", error, property_name, dataset_name)); } } /* * Look up a user defined property in the zap object. If it exists, push it * and the setpoint onto the stack, otherwise don't push anything. */ static int zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *property_name) { int error; char *buf; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * zcp_dataset_hold will either successfully return the requested * dataset or throw a lua error and longjmp out of the zfs.get_prop call * without returning. */ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN, buf, setpoint); dsl_dataset_rele(ds, FTAG); if (error != 0) { kmem_free(buf, ZAP_MAXVALUELEN); return (zcp_handle_error(state, dataset_name, property_name, error)); } (void) lua_pushstring(state, buf); (void) lua_pushstring(state, setpoint); kmem_free(buf, ZAP_MAXVALUELEN); return (2); } /* * Check if the property we're looking for is stored in the ds_dir. If so, * return it in the 'val' argument. Return 0 on success and ENOENT and if * the property is not present. */ static int get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val) { dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); switch (zfs_prop) { case ZFS_PROP_USEDSNAP: *val = dsl_dir_get_usedsnap(dd); break; case ZFS_PROP_USEDCHILD: *val = dsl_dir_get_usedchild(dd); break; case ZFS_PROP_USEDDS: *val = dsl_dir_get_usedds(dd); break; case ZFS_PROP_USEDREFRESERV: *val = dsl_dir_get_usedrefreserv(dd); break; case ZFS_PROP_LOGICALUSED: *val = dsl_dir_get_logicalused(dd); break; default: mutex_exit(&dd->dd_lock); return (SET_ERROR(ENOENT)); } mutex_exit(&dd->dd_lock); return (0); } /* * Check if the property we're looking for is stored at the dsl_dataset or * dsl_dir level. If so, push the property value and source onto the lua stack * and return 0. If it is not present or a failure occurs in lookup, return a * non-zero error value. 
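get_prop_src() above reduces to a three-way choice: read-only properties (and ZFS_PROP_VERSION) get no source at all, an empty setpoint means the default value was used, and anything else names the dataset the value was set on or inherited from. A standalone restatement of that decision follows; the function name is illustrative and NULL stands in for the Lua nil that the real code pushes.

#include <stdio.h>
#include <string.h>

/* NULL plays the role of lua_pushnil(); a string is what would be pushed. */
static const char *
prop_source(int prop_is_readonly, const char *setpoint)
{
	if (prop_is_readonly)
		return (NULL);
	if (strcmp(setpoint, "") == 0)
		return ("default");
	return (setpoint);
}

int
main(void)
{
	const char *s;

	s = prop_source(1, "tank/home");
	printf("%s\n", s == NULL ? "(nil)" : s);	/* read-only: no source */
	s = prop_source(0, "");
	printf("%s\n", s);				/* default */
	s = prop_source(0, "tank");
	printf("%s\n", s);				/* set/inherited at "tank" */
	return (0);
}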
*/ static int get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, zfs_prop_t zfs_prop) { int error = 0; objset_t *os; uint64_t numval = 0; char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); char setpoint[ZFS_MAX_DATASET_NAME_LEN] = "Internal error - setpoint not determined"; zfs_type_t ds_type = ZFS_TYPE_INVALID; zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); (void) get_objset_type(ds, &ds_type); switch (zfs_prop) { case ZFS_PROP_REFRATIO: numval = dsl_get_refratio(ds); break; case ZFS_PROP_USED: numval = dsl_get_used(ds); break; case ZFS_PROP_CLONES: { nvlist_t *clones = fnvlist_alloc(); error = get_clones_stat_impl(ds, clones); if (error == 0) { /* push list to lua stack */ VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0ULL)); /* source */ (void) lua_pushnil(state); } nvlist_free(clones); kmem_free(strval, ZAP_MAXVALUELEN); return (error); } case ZFS_PROP_COMPRESSRATIO: numval = dsl_get_compressratio(ds); break; case ZFS_PROP_CREATION: numval = dsl_get_creation(ds); break; case ZFS_PROP_REFERENCED: numval = dsl_get_referenced(ds); break; case ZFS_PROP_AVAILABLE: numval = dsl_get_available(ds); break; case ZFS_PROP_LOGICALREFERENCED: numval = dsl_get_logicalreferenced(ds); break; case ZFS_PROP_CREATETXG: numval = dsl_get_creationtxg(ds); break; case ZFS_PROP_GUID: numval = dsl_get_guid(ds); break; case ZFS_PROP_UNIQUE: numval = dsl_get_unique(ds); break; case ZFS_PROP_OBJSETID: numval = dsl_get_objsetid(ds); break; case ZFS_PROP_ORIGIN: dsl_dir_get_origin(ds->ds_dir, strval); break; case ZFS_PROP_USERACCOUNTING: error = dmu_objset_from_ds(ds, &os); if (error == 0) numval = dmu_objset_userspace_present(os); break; case ZFS_PROP_WRITTEN: error = dsl_get_written(ds, &numval); break; case ZFS_PROP_TYPE: error = get_objset_type_name(ds, strval); break; case ZFS_PROP_PREV_SNAP: error = dsl_get_prev_snap(ds, strval); break; case ZFS_PROP_NAME: dsl_dataset_name(ds, strval); break; case ZFS_PROP_MOUNTPOINT: error = dsl_get_mountpoint(ds, dsname, strval, setpoint); break; case ZFS_PROP_VERSION: /* should be a snapshot or filesystem */ ASSERT(ds_type != ZFS_TYPE_VOLUME); error = dmu_objset_from_ds(ds, &os); /* look in the master node for the version */ if (error == 0) { error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, sizeof (numval), 1, &numval); } break; case ZFS_PROP_DEFER_DESTROY: numval = dsl_get_defer_destroy(ds); break; case ZFS_PROP_USERREFS: numval = dsl_get_userrefs(ds); break; case ZFS_PROP_FILESYSTEM_COUNT: error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_SNAPSHOT_COUNT: error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_NUMCLONES: numval = dsl_get_numclones(ds); break; case ZFS_PROP_INCONSISTENT: numval = dsl_get_inconsistent(ds); break; case ZFS_PROP_IVSET_GUID: if (dsl_dataset_is_zapified(ds)) { error = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_IVSET_GUID, sizeof (numval), 1, &numval); } else { error = ENOENT; } break; case ZFS_PROP_RECEIVE_RESUME_TOKEN: { char *token = get_receive_resume_token(ds); if (token != NULL) { (void) strlcpy(strval, token, ZAP_MAXVALUELEN); kmem_strfree(token); } else { error = ENOENT; } break; } case ZFS_PROP_VOLSIZE: ASSERT(ds_type == ZFS_TYPE_VOLUME || ds_type == ZFS_TYPE_SNAPSHOT); error = dmu_objset_from_ds(ds, &os); if (error == 0) { error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", sizeof (numval), 1, &numval); } if 
(error == 0) (void) strlcpy(setpoint, dsname, ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_VOLBLOCKSIZE: { ASSERT(ds_type == ZFS_TYPE_VOLUME); dmu_object_info_t doi; error = dmu_objset_from_ds(ds, &os); if (error == 0) { error = dmu_object_info(os, ZVOL_OBJ, &doi); if (error == 0) numval = doi.doi_data_block_size; } break; } case ZFS_PROP_KEYSTATUS: case ZFS_PROP_KEYFORMAT: { /* provide defaults in case no crypto obj exists */ setpoint[0] = '\0'; if (zfs_prop == ZFS_PROP_KEYSTATUS) numval = ZFS_KEYSTATUS_NONE; else numval = ZFS_KEYFORMAT_NONE; nvlist_t *nvl, *propval; nvl = fnvlist_alloc(); dsl_dataset_crypt_stats(ds, nvl); if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop), &propval) == 0) { char *source; (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, &numval); if (nvlist_lookup_string(propval, ZPROP_SOURCE, &source) == 0) strlcpy(setpoint, source, sizeof (setpoint)); } nvlist_free(nvl); break; } case ZFS_PROP_SNAPSHOTS_CHANGED: numval = dsl_dir_snap_cmtime(ds->ds_dir).tv_sec; break; default: /* Did not match these props, check in the dsl_dir */ error = get_dsl_dir_prop(ds, zfs_prop, &numval); } if (error != 0) { kmem_free(strval, ZAP_MAXVALUELEN); return (error); } switch (prop_type) { case PROP_TYPE_NUMBER: { (void) lua_pushnumber(state, numval); break; } case PROP_TYPE_STRING: { (void) lua_pushstring(state, strval); break; } case PROP_TYPE_INDEX: { const char *propval; error = zfs_prop_index_to_string(zfs_prop, numval, &propval); if (error != 0) { kmem_free(strval, ZAP_MAXVALUELEN); return (error); } (void) lua_pushstring(state, propval); break; } } kmem_free(strval, ZAP_MAXVALUELEN); /* Push the source to the stack */ get_prop_src(state, setpoint, zfs_prop); return (0); } /* * Look up a property and its source in the zap object. If the value is * present and successfully retrieved, push the value and source on the * lua stack and return 0. On failure, return a non-zero error value. 
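Both zcp_get_user_prop() above and get_zap_prop() that follows fetch values through dsl_prop_get_ds(), whose element-size and count arguments encode the shape of the request: strings pass (1, ZAP_MAXVALUELEN) with a character buffer, while 64-bit integers pass (sizeof (numval), 1). The stand-in below only mirrors that calling convention; demo_prop_get() is not the real interface and the returned values are fabricated.

#include <stdint.h>
#include <stdio.h>

/*
 * Stand-in for the dsl_prop_get_ds() calling convention: element size of 1
 * with a count equal to the buffer length means "string", element size of
 * sizeof (uint64_t) with a count of 1 means "integer".
 */
static int
demo_prop_get(const char *name, int intsz, int numints, void *buf)
{
	(void) name;
	if (intsz == 1) {
		(void) snprintf(buf, numints, "on");		/* string-shaped */
	} else if (intsz == (int)sizeof (uint64_t) && numints == 1) {
		*(uint64_t *)buf = 131072;			/* integer-shaped */
	} else {
		return (-1);
	}
	return (0);
}

int
main(void)
{
	char strval[64];
	uint64_t numval;

	(void) demo_prop_get("compression", 1, sizeof (strval), strval);
	(void) demo_prop_get("recordsize", sizeof (numval), 1, &numval);
	printf("%s %llu\n", strval, (unsigned long long)numval);
	return (0);
}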
*/ static int get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) { int error = 0; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); uint64_t numval; const char *prop_name = zfs_prop_to_name(zfs_prop); zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); if (prop_type == PROP_TYPE_STRING) { /* Push value to lua stack */ error = dsl_prop_get_ds(ds, prop_name, 1, ZAP_MAXVALUELEN, strval, setpoint); if (error == 0) (void) lua_pushstring(state, strval); } else { error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), 1, &numval, setpoint); #ifdef _KERNEL /* Fill in temporary value for prop, if applicable */ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint); #else kmem_free(strval, ZAP_MAXVALUELEN); return (luaL_error(state, "temporary properties only supported in kernel mode", prop_name)); #endif /* Push value to lua stack */ if (prop_type == PROP_TYPE_INDEX) { const char *propval; error = zfs_prop_index_to_string(zfs_prop, numval, &propval); if (error == 0) (void) lua_pushstring(state, propval); } else { if (error == 0) (void) lua_pushnumber(state, numval); } } kmem_free(strval, ZAP_MAXVALUELEN); if (error == 0) get_prop_src(state, setpoint, zfs_prop); return (error); } /* * Determine whether property is valid for a given dataset */ boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) { zfs_type_t zfs_type = ZFS_TYPE_INVALID; /* properties not supported */ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) || (zfs_prop == ZFS_PROP_MOUNTED)) return (B_FALSE); /* if we want the origin prop, ds must be a clone */ if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir))) return (B_FALSE); int error = get_objset_type(ds, &zfs_type); if (error != 0) return (B_FALSE); return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE)); } /* * Look up a given dataset property. On success return 2, the number of * values pushed to the lua stack (property value and source). On a fatal * error, longjmp. On a non fatal error push nothing. */ static int zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, zfs_prop_t zfs_prop) { int error; /* * zcp_dataset_hold will either successfully return the requested * dataset or throw a lua error and longjmp out of the zfs.get_prop call * without returning. 
*/ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ /* Check that the property is valid for the given dataset */ const char *prop_name = zfs_prop_to_name(zfs_prop); if (!prop_valid_for_ds(ds, zfs_prop)) { dsl_dataset_rele(ds, FTAG); return (0); } /* Check if the property can be accessed directly */ error = get_special_prop(state, ds, dataset_name, zfs_prop); if (error == 0) { dsl_dataset_rele(ds, FTAG); /* The value and source have been pushed by get_special_prop */ return (2); } if (error != ENOENT) { dsl_dataset_rele(ds, FTAG); return (zcp_handle_error(state, dataset_name, prop_name, error)); } /* If we were unable to find it, look in the zap object */ error = get_zap_prop(state, ds, zfs_prop); dsl_dataset_rele(ds, FTAG); if (error != 0) { return (zcp_handle_error(state, dataset_name, prop_name, error)); } /* The value and source have been pushed by get_zap_prop */ return (2); } #ifdef _KERNEL static zfs_userquota_prop_t get_userquota_prop(const char *prop_name) { zfs_userquota_prop_t type; /* Figure out the property type ({user|group}{quota|used}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(prop_name, zfs_userquota_prop_prefixes[type], strlen(zfs_userquota_prop_prefixes[type])) == 0) break; } return (type); } /* * Given the name of a zfs_userquota_prop, this function determines the * prop type as well as the numeric group/user ids based on the string * following the '@' in the property name. On success, returns 0. On failure, * returns a non-zero error. * 'domain' must be free'd by caller using kmem_strfree() */ static int parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, char **domain, uint64_t *rid) { char *cp, *end, *domain_val; *type = get_userquota_prop(prop_name); if (*type >= ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); *rid = 0; cp = strchr(prop_name, '@') + 1; if (strncmp(cp, "S-1-", 4) == 0) { /* * It's a numeric SID (eg "S-1-234-567-89") and we want to * separate the domain id and the rid */ int domain_len = strrchr(cp, '-') - cp; domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); - (void) strncpy(domain_val, cp, domain_len); - domain_val[domain_len] = '\0'; + (void) strlcpy(domain_val, cp, domain_len + 1); cp += domain_len + 1; (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); if (*end != '\0') { kmem_strfree(domain_val); return (EINVAL); } } else { /* It's only a user/group ID (eg "12345"), just get the rid */ domain_val = NULL; (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); if (*end != '\0') return (EINVAL); } *domain = domain_val; return (0); } /* * Look up {user|group}{quota|used} property for given dataset. On success * push the value (quota or used amount) and the setpoint. On failure, push * a lua error. 
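The strlcpy() substitution in parse_userquota_prop() above is behavior-preserving: with a destination size of domain_len + 1, strlcpy() copies exactly domain_len bytes of the SID prefix and NUL-terminates, which is what the old strncpy()-plus-manual-terminator pair did. A standalone check follows; the local xstrlcpy() is only a stand-in for platforms whose libc lacks strlcpy().

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Minimal strlcpy() stand-in, for illustration only. */
static size_t
xstrlcpy(char *dst, const char *src, size_t dstsize)
{
	size_t len = strlen(src);

	if (dstsize != 0) {
		size_t n = (len >= dstsize) ? dstsize - 1 : len;
		memcpy(dst, src, n);
		dst[n] = '\0';
	}
	return (len);
}

int
main(void)
{
	/* A numeric SID as described in the comment, e.g. "S-1-234-567-89". */
	const char *cp = "S-1-234-567-89";
	int domain_len = (int)(strrchr(cp, '-') - cp);
	char *domain_val = malloc(domain_len + 1);

	/* Equivalent to strncpy(domain_val, cp, domain_len) plus a manual '\0'. */
	(void) xstrlcpy(domain_val, cp, domain_len + 1);
	printf("domain=%s rid=%s\n", domain_val, cp + domain_len + 1);
	free(domain_val);
	return (0);
}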
*/ static int zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *prop_name) { zfsvfs_t *zfvp; zfsvfs_t *zfsvfs; int error; zfs_userquota_prop_t type; char *domain; uint64_t rid, value = 0; objset_t *os; dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ error = parse_userquota_prop(prop_name, &type, &domain, &rid); if (error == 0) { error = dmu_objset_from_ds(ds, &os); if (error == 0) { zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = zfsvfs_create_impl(&zfvp, zfsvfs, os); if (error == 0) { error = zfs_userspace_one(zfvp, type, domain, rid, &value); zfsvfs_free(zfvp); } } if (domain != NULL) kmem_strfree(domain); } dsl_dataset_rele(ds, FTAG); if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) || (type == ZFS_PROP_GROUPQUOTA))) error = SET_ERROR(ENOENT); if (error != 0) { return (zcp_handle_error(state, dataset_name, prop_name, error)); } (void) lua_pushnumber(state, value); (void) lua_pushstring(state, dataset_name); return (2); } #endif /* * Determines the name of the snapshot referenced in the written property * name. Returns snapshot name in snap_name, a buffer that must be at least * as large as ZFS_MAX_DATASET_NAME_LEN */ static void parse_written_prop(const char *dataset_name, const char *prop_name, char *snap_name) { ASSERT(zfs_prop_written(prop_name)); const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; if (strchr(name, '@') == NULL) { (void) snprintf(snap_name, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", dataset_name, name); } else { (void) strlcpy(snap_name, name, ZFS_MAX_DATASET_NAME_LEN); } } /* * Look up written@ property for given dataset. On success * push the value and the setpoint. If error is fatal, we will * longjmp, otherwise push nothing. 
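parse_written_prop() above accepts both spellings of the property: "written@snap" is expanded relative to the dataset being queried, while a fully qualified "written@pool/fs@snap" is used verbatim. A standalone restatement follows; the prefix length is the length of "written@" and the buffer size is a stand-in for ZFS_MAX_DATASET_NAME_LEN.

#include <stdio.h>
#include <string.h>

#define	MAX_NAME_LEN		256	/* stand-in for ZFS_MAX_DATASET_NAME_LEN */
#define	WRITTEN_PREFIX_LEN	8	/* strlen("written@") */

static void
demo_parse_written_prop(const char *dataset_name, const char *prop_name,
    char *snap_name)
{
	const char *name = prop_name + WRITTEN_PREFIX_LEN;

	if (strchr(name, '@') == NULL) {
		/* Short form: a snapshot of the dataset being queried. */
		(void) snprintf(snap_name, MAX_NAME_LEN, "%s@%s",
		    dataset_name, name);
	} else {
		/* Fully qualified snapshot name: use it as-is. */
		(void) snprintf(snap_name, MAX_NAME_LEN, "%s", name);
	}
}

int
main(void)
{
	char snap[MAX_NAME_LEN];

	demo_parse_written_prop("tank/home", "written@monday", snap);
	printf("%s\n", snap);			/* tank/home@monday */
	demo_parse_written_prop("tank/home", "written@tank/old@monday", snap);
	printf("%s\n", snap);			/* tank/old@monday */
	return (0);
}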
*/ static int zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *prop_name) { char snap_name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t used, comp, uncomp; dsl_dataset_t *old; int error = 0; parse_written_prop(dataset_name, prop_name, snap_name); dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (new == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ error = dsl_dataset_hold(dp, snap_name, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); return (zcp_dataset_hold_error(state, dp, snap_name, error)); } error = dsl_dataset_space_written(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); if (error != 0) { return (zcp_handle_error(state, dataset_name, snap_name, error)); } (void) lua_pushnumber(state, used); (void) lua_pushstring(state, dataset_name); return (2); } static int zcp_get_prop(lua_State *state); static const zcp_lib_info_t zcp_get_prop_info = { .name = "get_prop", .func = zcp_get_prop, .pargs = { { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, { .za_name = "property", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { {NULL, 0} } }; static int zcp_get_prop(lua_State *state) { const char *dataset_name; const char *property_name; dsl_pool_t *dp = zcp_run_info(state)->zri_pool; const zcp_lib_info_t *libinfo = &zcp_get_prop_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); dataset_name = lua_tostring(state, 1); property_name = lua_tostring(state, 2); /* User defined property */ if (zfs_prop_user(property_name)) { return (zcp_get_user_prop(state, dp, dataset_name, property_name)); } /* userspace property */ if (zfs_prop_userquota(property_name)) { #ifdef _KERNEL return (zcp_get_userquota_prop(state, dp, dataset_name, property_name)); #else return (luaL_error(state, "user quota properties only supported in kernel mode", property_name)); #endif } /* written@ property */ if (zfs_prop_written(property_name)) { return (zcp_get_written_prop(state, dp, dataset_name, property_name)); } zfs_prop_t zfs_prop = zfs_name_to_prop(property_name); /* Valid system property */ if (zfs_prop != ZPROP_INVAL) { return (zcp_get_system_prop(state, dp, dataset_name, zfs_prop)); } /* Invalid property name */ return (luaL_error(state, "'%s' is not a valid property", property_name)); } int zcp_load_get_lib(lua_State *state) { lua_pushcclosure(state, zcp_get_prop_info.func, 0); lua_setfield(state, -2, zcp_get_prop_info.name); return (1); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 259d68c477de..bafe6fe7dfa4 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1,7887 +1,7887 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Portions Copyright 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, 2021, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. 
it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. 
* * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; static zfsdev_state_t *zfsdev_state_list; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ unsigned long zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). */ static unsigned long zfs_history_output_max = 1024 * 1024; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. 
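The registration interface described above boils down to a vector table: each entry ties a name and request number to a security-policy callback that runs first and an ioctl callback that runs only if the policy check succeeds. The shape-only sketch below illustrates that contract; the struct, callbacks, and the request number are simplified stand-ins, and the real zfs_ioc_vec_t carries the additional fields (namecheck, pool_check, nvl_keys, smush_outnvlist, allow_log) listed in the comment.

#include <stdio.h>
#include <stddef.h>

typedef int (*demo_secpolicy_t)(const char *name);
typedef int (*demo_ioc_func_t)(const char *name);

typedef struct demo_ioc_vec {
	const char		*zvec_name;
	unsigned		zvec_ioc;
	demo_secpolicy_t	zvec_secpolicy;
	demo_ioc_func_t		zvec_func;
} demo_ioc_vec_t;

static int secpolicy_allow(const char *name) { (void) name; return (0); }

static int
ioc_demo_snapshot(const char *name)
{
	printf("snapshot callback invoked for %s\n", name);
	return (0);
}

static const demo_ioc_vec_t demo_vec[] = {
	{ "snapshot", 0x23, secpolicy_allow, ioc_demo_snapshot },
};

/* Dispatch mirrors the contract: secpolicy first, callback only on success. */
static int
demo_dispatch(unsigned ioc, const char *name)
{
	for (size_t i = 0; i < sizeof (demo_vec) / sizeof (demo_vec[0]); i++) {
		if (demo_vec[i].zvec_ioc != ioc)
			continue;
		int error = demo_vec[i].zvec_secpolicy(name);
		if (error != 0)
			return (error);
		return (demo_vec[i].zvec_func(name));
	}
	return (-1);	/* cf. ZFS_ERR_IOC_CMD_UNAVAIL for unknown requests */
}

int
main(void)
{
	return (demo_dispatch(0x23, "tank/home"));
}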
*/ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl, (void) cr; if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. 
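The zone check in zfs_dozonecheck_impl() above is a small decision table: the dataset must be visible in the caller's zone at all; from the global zone a 'zoned' dataset is reserved to privileged users; from a non-global zone the dataset must be both zoned and writable there. A standalone restatement follows, using the same errno values as the original; the flag-style arguments are illustrative.

#include <errno.h>
#include <stdio.h>

/*
 * 0 on success, ENOENT if the dataset is not visible in this zone, and
 * EPERM for the privilege/zoning failures, mirroring zfs_dozonecheck_impl().
 */
static int
demo_zonecheck(int in_global_zone, int visible, int writable,
    int zoned, int unprivileged)
{
	if (!in_global_zone && !visible)
		return (ENOENT);
	if (in_global_zone) {
		if (unprivileged && zoned)
			return (EPERM);
	} else {
		if (!zoned || !writable)
			return (EPERM);
	}
	return (0);
}

int
main(void)
{
	/* Global zone, unprivileged caller, zoned dataset: refused. */
	printf("%d\n", demo_zonecheck(1, 1, 1, 1, 1));	/* EPERM */
	/* Non-global zone, visible + writable + zoned: allowed. */
	printf("%d\n", demo_zonecheck(0, 1, 1, 1, 1));	/* 0 */
	/* Non-global zone, dataset not zoned: refused. */
	printf("%d\n", demo_zonecheck(0, 1, 1, 0, 1));	/* EPERM */
	return (0);
}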
*/ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. 
contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ (void) innvl; return (zfs_dozonecheck(zc->zc_name, cr)); } static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ - (void) strncpy(parent, datasetname, parentsize); + (void) strlcpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. 
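zfs_get_parent() above derives the parent by copying the name (now via strlcpy(), which both bounds the copy and guarantees NUL termination) and then truncating at the last '@', or failing that, the last '/'. A standalone restatement follows; snprintf() stands in for strlcpy() so the sketch builds everywhere, and truncation behavior for undersized buffers is not shown.

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int
demo_get_parent(const char *datasetname, char *parent, size_t parentsize)
{
	char *cp;

	(void) snprintf(parent, parentsize, "%s", datasetname);
	cp = strrchr(parent, '@');		/* snapshot -> its dataset */
	if (cp == NULL) {
		cp = strrchr(parent, '/');	/* dataset -> its parent fs */
		if (cp == NULL)
			return (ENOENT);	/* a pool root has no parent */
	}
	cp[0] = '\0';
	return (0);
}

int
main(void)
{
	char parent[256];

	if (demo_get_parent("tank/home@snap", parent, sizeof (parent)) == 0)
		printf("%s\n", parent);		/* tank/home */
	if (demo_get_parent("tank/home/user", parent, sizeof (parent)) == 0)
		printf("%s\n", parent);		/* tank/home */
	printf("%d\n", demo_get_parent("tank", parent, sizeof (parent)));
	return (0);
}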
*/ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. 
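 * Each name must contain an '@'; the check is performed against the
 * filesystem portion before the '@' using the "snapshot" delegated
 * permission.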
*/ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. 
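 * (These back the error-injection ioctls, e.g. as driven by zinject(8);
 * there is no delegated-permission fallback.)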
*/ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; return (secpolicy_zinject(cr)); } static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. 
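 * Without the "diff" delegation the caller instead needs the individual
 * permissions: snapshot, plus hold, release and destroy when holds are
 * requested (innvl != NULL).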
*/ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. 
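 *
 * Typical usage (compare zfs_prop_set_userquota() below):
 *
 *	if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE)) == 0) {
 *		... operate on the held zfsvfs ...
 *		zfsvfs_rele(zfsvfs, FTAG);
 *	}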
*/ static int zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { if (writer) ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); else ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = 
put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } static int 
zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, 
config, replacing, rebuild); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; const char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. 
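 * Returning ENOTSUP lets the caller distinguish "received properties are
 * not tracked for this dataset yet" from an empty received-property list.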
*/ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. 
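 * The ZAP cursor in zc_cookie is left untouched, so the retry resumes
 * the directory iteration instead of starting over.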
*/ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { zc->zc_objset_stats.dds_creation_txg = dsl_get_creationtxg(ds); dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. 
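 * (A -1 return sends the caller down the generic dsl_props_set() /
 * dsl_prop_set_*() path instead.)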
* * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; int err = -1; if (prop == ZPROP_USERPROP) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: err = zvol_set_snapdev(dsname, source, intval); break; case ZFS_PROP_VOLMODE: err = zvol_set_volmode(dsname, source, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. 
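 *
 * Properties that fail on the first pass are retried once, since some
 * failures (e.g. a quota arriving before the reservation it must cover)
 * are only an artifact of ordering.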
*/ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; int err; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } if (zfs_is_namespace_prop(prop)) should_update_mount_cache = B_TRUE; } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (nvlist_empty(genericnvl)) goto out; /* * Try to set them all in one batch. */ err = dsl_props_set(dsname, source, genericnvl); if (err == 0) goto out; /* * If batching fails, we still want to set as many properties as we * can, so try setting them individually. 
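 * Failures from this pass are still recorded in errlist, and the last
 * error seen becomes the function's return value.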
*/ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } out: if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. 
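 * spa_open() fails on a faulted pool, but the cachefile property must
 * remain settable so such a pool can still be administered.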
*/ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * innvl: { * "vdevprops_set_vdev" -> guid * "vdevprops_set_props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = { {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); ASSERT(spa_writeable(spa)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_set(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * innvl: { * "vdevprops_get_vdev" -> guid * (optional) "vdevprops_get_props" -> { propname -> propid } * } * * outnvl: propname -> value */ static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = { {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL} }; static int zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_get(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { 
nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. 
*/ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
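 * Whichever of normalization, utf8only and casesensitivity the creator
 * did not specify is inherited from this parent via zfs_get_zplprop().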
*/ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. 
Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ (void) fsname, (void) innvl, (void) outnvl; return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. 
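 * A single request cannot create two snapshots of the same filesystem,
 * so any other entry that shares the name up to and including the '@'
 * is rejected with EXDEV.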
*/ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { (void) unused, (void) outnvl; const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } static int zfs_unmount_snap_cb(const char *snapname, void *arg) { (void) arg; zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. 
This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) poolname; return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... 
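 *     (for example "guid", "createtxg" and "creation"; the exact set
 *     returned by dsl_get_bookmark_props() depends on the bookmark)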
* } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) 
zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || cmd_type == POOL_INITIALIZE_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { return (SET_ERROR(EINVAL)); } for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* * innvl: { * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64) * "trim_vdevs": { -> guids to TRIM (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * "trim_rate" -> Target TRIM rate in bytes/sec. * "trim_secure" -> Set to request a secure TRIM. * } * * outnvl: { * "trim_vdevs": { -> TRIM errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. 
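 *
 * Illustrative caller sketch (hypothetical values, kernel nvpair API)
 * showing an innvl that starts a secure TRIM on a single vdev:
 *
 *     nvlist_t *vdevs = fnvlist_alloc();
 *     fnvlist_add_uint64(vdevs, "/dev/disk/by-id/example", vdev_guid);
 *
 *     nvlist_t *innvl = fnvlist_alloc();
 *     fnvlist_add_uint64(innvl, ZPOOL_TRIM_COMMAND, POOL_TRIM_START);
 *     fnvlist_add_nvlist(innvl, ZPOOL_TRIM_VDEVS, vdevs);
 *     fnvlist_add_uint64(innvl, ZPOOL_TRIM_RATE, 64ULL << 20);
 *     fnvlist_add_boolean_value(innvl, ZPOOL_TRIM_SECURE, B_TRUE);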
*/ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. 
If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. */ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? 
error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. * innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; nvlist_t *redactnvl = NULL; char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? 
DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. 
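 * (The same gating applies to zle, lz4 and zstd just below.)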
*/ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. 
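 * (Both checks are needed because a failed receive may have to restore
 * a property either by setting it again or by re-inheriting it; see the
 * error paths at the end of zfs_ioc_recv_impl().)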
* * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. 
We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, /* * Setting ZFS_PROP_SHARESMB requires the objset type to be * known, which is not possible prior to receipt of raw sends. */ ZFS_PROP_SHARESMB, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((input_fp = zfs_file_get(input_fd)) == NULL) return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). 
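 * (props_reduce() below drops pairs whose received value would not
 * change, from both recvprops and origrecvd.)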
* * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_USERPROP) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. 
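 * extract_delay_props() pulled refquota, keylocation and sharesmb out
 * earlier because they can only be applied after the stream has been
 * received.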
*/ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } if (inherited_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inherited_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } if (inherited_delayprops != NULL) { ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); nvlist_free(inherited_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. 
*/ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fp); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) return (error); if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) return (error); if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. 
*/ error = SET_ERROR(EINVAL); } nvlist_free(errors); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; char *snapname; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); return (error); } typedef struct dump_bytes_io { 
zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_io_t dbi; dbi.dbi_fp = arg; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dbi.dbi_err); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { zfs_file_t *fp; dmu_send_outparams_t out = {0}; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); zfs_file_put(fp); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send 
stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. */ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; uint64_t count = zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. 
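 * Such a pool typically must be exported and re-imported before I/O
 * can safely resume, so fail the clear here.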
*/ if (spa_multihost(spa) && spa_suspended(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decide if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. 
"S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) { mutex_enter(&zfsvfs->z_os->os_upgrade_lock); if (zfsvfs->z_os->os_upgrade_id == 0) { /* clear potential error code and retry */ zfsvfs->z_os->os_upgrade_status = 0; mutex_exit(&zfsvfs->z_os->os_upgrade_lock); dsl_pool_config_enter( dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_userspace_upgrade(zfsvfs->z_os); dsl_pool_config_exit( dmu_objset_pool(zfsvfs->z_os), FTAG); } else { mutex_exit(&zfsvfs->z_os->os_upgrade_lock); } taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, zfsvfs->z_os->os_upgrade_id); error = zfsvfs->z_os->os_upgrade_status; } zfs_vfs_rele(zfsvfs); } else { objset_t *os; /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_userspace_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 
0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; minor_t minor; zfs_file_t *fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(fp); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(fp); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { (void) pool; nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; zfs_file_t *fp = NULL; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { fp = zfs_onexit_fd_hold(cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (fp != NULL) { ASSERT3U(minor, !=, 0); zfs_onexit_fd_rele(fp); } return (SET_ERROR(error)); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... 
* } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { (void) args; return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { (void) pool; return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(fp); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { int count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(fp); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; firstsnap = 
fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int error; offset_t off; char *fromname = NULL; int fd; zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); if ((fp = zfs_file_get(fd)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); dmu_send_outparams_t out = {0}; out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); zfs_file_put(fp); return 
(error); } static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { (void) os, (void) buf; uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; char *fromname = NULL; char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. 
*/ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { (void) onvl; int err; boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) { err = nvlist_lookup_boolean_value(innvl, "force", &rc); if (err == 0) force = rc; } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (0); } /* * Load a user's wrapping key into the kernel. 
* innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the 
beginning of this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, 
ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, 
zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS, zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props, ARRAY_SIZE(zfs_keys_vdev_get_props)); zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS, zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); 
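	/*
	 * As an illustration of the legacy registration helpers (see
	 * zfs_ioctl_register_pool_modify() above), each of these calls
	 * expands to roughly:
	 *
	 *	zfs_ioctl_register_legacy(ZFS_IOC_POOL_UPGRADE,
	 *	    zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME,
	 *	    B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
	 *
	 * i.e. a pool-modifying ioctl requires config privilege, operates
	 * on a pool name, is logged to pool history, and is rejected while
	 * the pool is suspended or read-only.
	 */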
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, 
zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. 
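 *
 * For example (illustrative): ZFS_IOC_HOLD's key table, zfs_keys_hold[],
 * requires a "holds" nvlist and optionally accepts an int32 "cleanup_fd",
 * so an innvl of
 *
 *	{ "holds" -> { "pool/fs@snap" -> "mytag" }, "cleanup_fd" -> 7 }
 *
 * passes this check, while any unrecognized pair (other than the special
 * "optional" nvlist) or a wrongly-typed value does not.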
* * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) { zfsdev_state_t *zs, *fpd; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } /* * Find a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. 
*/ static minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } int zfsdev_state_init(void *priv) { zfsdev_state_t *zs, *zsprev = NULL; minor_t minor; boolean_t newzs = B_FALSE; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; } if (!zs) { zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); newzs = B_TRUE; } zfsdev_private_set_state(priv, zs); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); /* * In order to provide for lock-free concurrent read access * to the minor list in zfsdev_get_state(), new entries * must be completely written before linking them into the * list whereas existing entries are already linked; the last * operation must be updating zs_minor (from -1 to the new * value). */ if (newzs) { zs->zs_minor = minor; membar_producer(); zsprev->zs_next = zs; } else { membar_producer(); zs->zs_minor = minor; } return (0); } void zfsdev_state_destroy(void *priv) { zfsdev_state_t *zs = zfsdev_private_get_state(priv); ASSERT(zs != NULL); ASSERT3S(zs->zs_minor, >, 0); /* * The last reference to this zfsdev file descriptor is being dropped. * We don't have to worry about lookup grabbing this state object, and * zfsdev_state_init() will not try to reuse this object until it is * invalidated by setting zs_minor to -1. Invalidation must be done * last, with a memory barrier to ensure ordering. This lets us avoid * taking the global zfsdev state lock around destruction. */ zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); zs->zs_onexit = NULL; zs->zs_zevent = NULL; membar_producer(); zs->zs_minor = -1; } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. 
*/ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. 
*/ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { size_t out_size = fnvlist_size(outnvl); if (out_size > zfs_history_output_max) { fnvlist_add_int64(lognv, ZPOOL_HIST_OUTPUT_SIZE, out_size); } else { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); zfsdev_state_list->zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 4f7cb8430d3e..3598351c499d 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -1,972 +1,972 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * ZFS fault injection * * To handle fault injection, we keep track of a series of zinject_record_t * structures which describe which logical block(s) should be injected with a * fault. These are kept in a global list. Each record corresponds to a given * spa_t and maintains a special hold on the spa_t so that it cannot be deleted * or exported while the injection record exists. * * Device level injection is done using the 'zi_guid' field. If this is set, it * means that the error is destined for a particular device, not a piece of * data. * * This is a rather poor data structure and algorithm, but we don't expect more * than a few faults at any one time, so it should be sufficient for our needs. */ #include #include #include #include #include #include #include uint32_t zio_injection_enabled = 0; /* * Data describing each zinject handler registered on the system, and * contains the list node linking the handler in the global zinject * handler list. */ typedef struct inject_handler { int zi_id; spa_t *zi_spa; zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; list_node_t zi_link; } inject_handler_t; /* * List of all zinject handlers registered on the system, protected by * the inject_lock defined below. */ static list_t inject_handlers; /* * This protects insertion into, and traversal of, the inject handler * list defined above; as well as the inject_delay_count. Any time a * handler is inserted or removed from the list, this lock should be * taken as a RW_WRITER; and any time traversal is done over the list * (without modification to it) this lock should be taken as a RW_READER. */ static krwlock_t inject_lock; /* * This holds the number of zinject delay handlers that have been * registered on the system. It is protected by the inject_lock defined * above. Thus modifications to this count must be a RW_WRITER of the * inject_lock, and reads of this count must be (at least) a RW_READER * of the lock. */ static int inject_delay_count = 0; /* * This lock is used only in zio_handle_io_delay(), refer to the comment * in that function for more details. */ static kmutex_t inject_delay_mtx; /* * Used to assign unique identifying numbers to each new zinject handler. */ static int inject_next_id = 1; /* * Test if the requested frequency was triggered */ static boolean_t freq_triggered(uint32_t frequency) { /* * zero implies always (100%) */ if (frequency == 0) return (B_TRUE); /* * Note: we still handle legacy (unscaled) frequency values */ uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX; return (random_in_range(maximum) < frequency); } /* * Returns true if the given record matches the I/O in progress. 
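 *
 * For example (illustrative): a record whose zi_objset, zi_object and
 * zi_level match the bookmark, whose zi_start..zi_end range covers the
 * blkid, and whose zi_error equals the injected errno (e.g. EIO) matches
 * the read; if zi_dvas is nonzero the DVA being read must also be set in
 * that bitmask, and zi_freq then limits how often the match fires (see
 * freq_triggered() above).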
*/ static boolean_t zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, zinject_record_t *record, int error) { /* * Check for a match against the MOS, which is based on type */ if (zb->zb_objset == DMU_META_OBJSET && record->zi_objset == DMU_META_OBJSET && record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) return (freq_triggered(record->zi_freq)); else return (B_FALSE); } /* * Check for an exact match. */ if (zb->zb_objset == record->zi_objset && zb->zb_object == record->zi_object && zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && (record->zi_dvas == 0 || (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) && error == record->zi_error) { return (freq_triggered(record->zi_freq)); } return (B_FALSE); } /* * Panic the system when a config change happens in the function * specified by tag. */ void zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa) continue; if (handler->zi_record.zi_type == type && strcmp(tag, handler->zi_record.zi_func) == 0) panic("Panic requested in function %s\n", tag); } rw_exit(&inject_lock); } /* * Inject a decryption failure. Decryption failures can occur in * both the ARC and the ZIO layers. */ int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, uint64_t type, int error) { int ret = 0; inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT) continue; if (zio_match_handler(zb, type, ZI_NO_DVA, &handler->zi_record, error)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } /* * If this is a physical I/O for a vdev child determine which DVA it is * for. We iterate backwards through the DVAs matching on the offset so * that we end up with ZI_NO_DVA (-1) if we don't find a match. */ static int zio_match_dva(zio_t *zio) { int i = ZI_NO_DVA; if (zio->io_bp != NULL && zio->io_vd != NULL && zio->io_child_type == ZIO_CHILD_VDEV) { for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) { dva_t *dva = &zio->io_bp->blk_dva[i]; uint64_t off = DVA_GET_OFFSET(dva); vdev_t *vd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(dva)); /* Compensate for vdev label added to leaves */ if (zio->io_vd->vdev_ops->vdev_op_leaf) off += VDEV_LABEL_START_SIZE; if (zio->io_vd == vd && zio->io_offset == off) break; } } return (i); } /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. */ int zio_handle_fault_injection(zio_t *zio, int error) { int ret = 0; inject_handler_t *handler; /* * Ignore I/O not associated with any logical data. */ if (zio->io_logical == NULL) return (0); /* * Currently, we only support fault injection on reads. */ if (zio->io_type != ZIO_TYPE_READ) return (0); /* * A rebuild I/O has no checksum to verify. 
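zio_match_handler() narrows an injection to particular copies of a block through the zi_dvas bitmask: zero means any DVA may match, otherwise bit i selects DVA index i, and ZI_NO_DVA (no DVA identified) can never satisfy a non-zero mask. A small standalone sketch of just that test, with hypothetical names:

#include <stdint.h>
#include <stdio.h>

/* Return nonzero if an I/O touching DVA index 'dva' matches the mask. */
static int
dva_matches(uint64_t zi_dvas, int dva)
{
	if (zi_dvas == 0)	/* no mask: any DVA matches */
		return (1);
	if (dva < 0)		/* ZI_NO_DVA: DVA unknown, cannot match a mask */
		return (0);
	return ((zi_dvas & (1ULL << dva)) != 0);
}

int
main(void)
{
	uint64_t mask = (1ULL << 0) | (1ULL << 2);	/* inject on copies 0 and 2 only */

	for (int dva = -1; dva < 3; dva++)
		printf("dva %2d -> %s\n", dva,
		    dva_matches(mask, dva) ? "inject" : "skip");
	return (0);
}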
*/ if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return the specified error */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, zio_match_dva(zio), &handler->zi_record, error)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } /* * Determine if the zio is part of a label update and has an injection * handler associated with that portion of the label. Currently, we * allow error injection in either the nvlist or the uberblock region of * of the vdev label. */ int zio_handle_label_injection(zio_t *zio, int error) { inject_handler_t *handler; vdev_t *vd = zio->io_vd; uint64_t offset = zio->io_offset; int label; int ret = 0; if (offset >= VDEV_LABEL_START_SIZE && offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* * The injection region is the relative offsets within a * vdev label. We must determine the label which is being * updated and adjust our region accordingly. */ label = vdev_label_number(vd->vdev_psize, offset); start = vdev_label_offset(vd->vdev_psize, label, start); end = vdev_label_offset(vd->vdev_psize, label, end); if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && (offset >= start && offset <= end)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } static int zio_inject_bitflip_cb(void *data, size_t len, void *private) { zio_t *zio = private; uint8_t *buffer = data; uint_t byte = random_in_range(len); ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); /* flip a single random bit in an abd data buffer */ buffer[byte] ^= 1 << random_in_range(8); return (1); /* stop after first flip */ } static int zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) { inject_handler_t *handler; int ret = 0; /* * We skip over faults in the labels unless it's during * device open (i.e. zio == NULL). */ if (zio != NULL) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); } rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { if (handler->zi_record.zi_failfast && (zio == NULL || (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { continue; } /* Handle type specific I/O failures */ if (zio != NULL && handler->zi_record.zi_iotype != ZIO_TYPES && handler->zi_record.zi_iotype != zio->io_type) continue; if (handler->zi_record.zi_error == err1 || handler->zi_record.zi_error == err2) { /* * limit error injection if requested */ if (!freq_triggered(handler->zi_record.zi_freq)) continue; /* * For a failed open, pretend like the device * has gone away. 
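For label injection, the handler's zi_start/zi_end are offsets relative to a single label and are translated to absolute device offsets for whichever label the write targets. Below is a simplified, assumed model of that translation, taking the conventional layout of four 256 KiB labels with two at the front and two at the back of the device; the authoritative calculation is vdev_label_offset(), and all names here are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define	LABEL_SIZE_SKETCH	(256ULL * 1024)	/* assumed vdev label size */
#define	NLABELS_SKETCH		4		/* two at the front, two at the back */

/*
 * Simplified model: absolute offset of a label-relative offset for
 * label 'l' on a device of 'psize' bytes.
 */
static uint64_t
label_offset_sketch(uint64_t psize, int l, uint64_t offset)
{
	uint64_t base = (l < NLABELS_SKETCH / 2) ? 0 :
	    psize - NLABELS_SKETCH * LABEL_SIZE_SKETCH;
	return (base + (uint64_t)l * LABEL_SIZE_SKETCH + offset);
}

int
main(void)
{
	uint64_t psize = 1ULL << 30;	/* 1 GiB device, for illustration */

	for (int l = 0; l < NLABELS_SKETCH; l++)
		printf("label %d, relative 0x1000 -> absolute 0x%llx\n", l,
		    (unsigned long long)label_offset_sketch(psize, l, 0x1000));
	return (0);
}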
*/ if (err1 == ENXIO) vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; /* * Treat these errors as if they had been * retried so that all the appropriate stats * and FMA events are generated. */ if (!handler->zi_record.zi_failfast && zio != NULL) zio->io_flags |= ZIO_FLAG_IO_RETRY; /* * EILSEQ means flip a bit after a read */ if (handler->zi_record.zi_error == EILSEQ) { if (zio == NULL) break; /* locate buffer data and flip a bit */ (void) abd_iterate_func(zio->io_abd, 0, zio->io_size, zio_inject_bitflip_cb, zio); break; } ret = handler->zi_record.zi_error; break; } if (handler->zi_record.zi_error == ENXIO) { ret = SET_ERROR(EIO); break; } } } rw_exit(&inject_lock); return (ret); } int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) { return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX)); } int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2) { return (zio_handle_device_injection_impl(vd, zio, err1, err2)); } /* * Simulate hardware that ignores cache flushes. For requested number * of seconds nix the actual writing to disk. */ void zio_handle_ignored_writes(zio_t *zio) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* * Positive duration implies # of seconds, negative * a number of txgs */ if (handler->zi_record.zi_timer == 0) { if (handler->zi_record.zi_duration > 0) handler->zi_record.zi_timer = ddi_get_lbolt64(); else handler->zi_record.zi_timer = zio->io_txg; } /* Have a "problem" writing 60% of the time */ if (random_in_range(100) < 60) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; break; } rw_exit(&inject_lock); } void spa_handle_ignored_writes(spa_t *spa) { inject_handler_t *handler; if (zio_injection_enabled == 0) return; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; if (handler->zi_record.zi_duration > 0) { VERIFY(handler->zi_record.zi_timer == 0 || ddi_time_after64( (int64_t)handler->zi_record.zi_timer + handler->zi_record.zi_duration * hz, ddi_get_lbolt64())); } else { /* duration is negative so the subtraction here adds */ VERIFY(handler->zi_record.zi_timer == 0 || handler->zi_record.zi_timer - handler->zi_record.zi_duration >= spa_syncing_txg(spa)); } } rw_exit(&inject_lock); } hrtime_t zio_handle_io_delay(zio_t *zio) { vdev_t *vd = zio->io_vd; inject_handler_t *min_handler = NULL; hrtime_t min_target = 0; rw_enter(&inject_lock, RW_READER); /* * inject_delay_count is a subset of zio_injection_enabled that * is only incremented for delay handlers. These checks are * mainly added to remind the reader why we're not explicitly * checking zio_injection_enabled like the other functions. */ IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); /* * If there aren't any inject delay handlers registered, then we * can short circuit and simply return 0 here. A value of zero * informs zio_delay_interrupt() that this request should not be * delayed. This short circuit keeps us from acquiring the * inject_delay_mutex unnecessarily. 
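For ignored-write handlers, zi_timer is armed on first use: a positive zi_duration keeps the handler active for that many seconds from the recorded lbolt, a negative one for that many txgs from the recorded txg. A sketch of the corresponding expiry test under those assumptions (all names hypothetical; hz is the tick rate):

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch: has an "ignored writes" handler run its course?
 *   duration > 0: timer holds the lbolt when armed, duration is seconds.
 *   duration < 0: timer holds the txg when armed, -duration is txgs.
 */
static int
ignored_writes_expired(int64_t timer, int64_t duration, int64_t now_lbolt,
    uint64_t syncing_txg, int hz)
{
	if (timer == 0)			/* never armed yet */
		return (0);
	if (duration > 0)
		return (now_lbolt > timer + duration * hz);
	return (syncing_txg > (uint64_t)(timer - duration));
}

int
main(void)
{
	/* Armed at lbolt 1000 for 5 seconds with hz=100: expires after 1500. */
	printf("%d\n", ignored_writes_expired(1000, 5, 1400, 0, 100));	/* 0 */
	printf("%d\n", ignored_writes_expired(1000, 5, 1600, 0, 100));	/* 1 */
	/* Armed at txg 50 for 3 txgs: expires after txg 53. */
	printf("%d\n", ignored_writes_expired(50, -3, 0, 52, 100));	/* 0 */
	printf("%d\n", ignored_writes_expired(50, -3, 0, 54, 100));	/* 1 */
	return (0);
}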
*/ if (inject_delay_count == 0) { rw_exit(&inject_lock); return (0); } /* * Each inject handler has a number of "lanes" associated with * it. Each lane is able to handle requests independently of one * another, and at a latency defined by the inject handler * record's zi_timer field. Thus if a handler in configured with * a single lane with a 10ms latency, it will delay requests * such that only a single request is completed every 10ms. So, * if more than one request is attempted per each 10ms interval, * the average latency of the requests will be greater than * 10ms; but if only a single request is submitted each 10ms * interval the average latency will be 10ms. * * We need to acquire this mutex to prevent multiple concurrent * threads being assigned to the same lane of a given inject * handler. The mutex allows us to perform the following two * operations atomically: * * 1. determine the minimum handler and minimum target * value of all the possible handlers * 2. update that minimum handler's lane array * * Without atomicity, two (or more) threads could pick the same * lane in step (1), and then conflict with each other in step * (2). This could allow a single lane handler to process * multiple requests simultaneously, which shouldn't be possible. */ mutex_enter(&inject_delay_mtx); for (inject_handler_t *handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; if (!freq_triggered(handler->zi_record.zi_freq)) continue; if (vd->vdev_guid != handler->zi_record.zi_guid) continue; /* * Defensive; should never happen as the array allocation * occurs prior to inserting this handler on the list. */ ASSERT3P(handler->zi_lanes, !=, NULL); /* * This should never happen, the zinject command should * prevent a user from setting an IO delay with zero lanes. */ ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); ASSERT3U(handler->zi_record.zi_nlanes, >, handler->zi_next_lane); /* * We want to issue this IO to the lane that will become * idle the soonest, so we compare the soonest this * specific handler can complete the IO with all other * handlers, to find the lowest value of all possible * lanes. We then use this lane to submit the request. * * Since each handler has a constant value for its * delay, we can just use the "next" lane for that * handler; as it will always be the lane with the * lowest value for that particular handler (i.e. the * lane that will become idle the soonest). This saves a * scan of each handler's lanes array. * * There's two cases to consider when determining when * this specific IO request should complete. If this * lane is idle, we want to "submit" the request now so * it will complete after zi_timer milliseconds. Thus, * we set the target to now + zi_timer. * * If the lane is busy, we want this request to complete * zi_timer milliseconds after the lane becomes idle. * Since the 'zi_lanes' array holds the time at which * each lane will become idle, we use that value to * determine when this request should complete. 
*/ hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); hrtime_t busy = handler->zi_record.zi_timer + handler->zi_lanes[handler->zi_next_lane]; hrtime_t target = MAX(idle, busy); if (min_handler == NULL) { min_handler = handler; min_target = target; continue; } ASSERT3P(min_handler, !=, NULL); ASSERT3U(min_target, !=, 0); /* * We don't yet increment the "next lane" variable since * we still might find a lower value lane in another * handler during any remaining iterations. Once we're * sure we've selected the absolute minimum, we'll claim * the lane and increment the handler's "next lane" * field below. */ if (target < min_target) { min_handler = handler; min_target = target; } } /* * 'min_handler' will be NULL if no IO delays are registered for * this vdev, otherwise it will point to the handler containing * the lane that will become idle the soonest. */ if (min_handler != NULL) { ASSERT3U(min_target, !=, 0); min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; /* * If we've used all possible lanes for this handler, * loop back and start using the first lane again; * otherwise, just increment the lane index. */ min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % min_handler->zi_record.zi_nlanes; } mutex_exit(&inject_delay_mtx); rw_exit(&inject_lock); return (min_target); } static int zio_calculate_range(const char *pool, zinject_record_t *record) { dsl_pool_t *dp; dsl_dataset_t *ds; objset_t *os = NULL; dnode_t *dn = NULL; int error; /* * Obtain the dnode for object using pool, objset, and object */ error = dsl_pool_hold(pool, FTAG, &dp); if (error) return (error); error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds); dsl_pool_rele(dp, FTAG); if (error) return (error); error = dmu_objset_from_ds(ds, &os); dsl_dataset_rele(ds, FTAG); if (error) return (error); error = dnode_hold(os, record->zi_object, FTAG, &dn); if (error) return (error); /* * Translate the range into block IDs */ if (record->zi_start != 0 || record->zi_end != -1ULL) { record->zi_start >>= dn->dn_datablkshift; record->zi_end >>= dn->dn_datablkshift; } if (record->zi_level > 0) { if (record->zi_level >= dn->dn_nlevels) { dnode_rele(dn, FTAG); return (SET_ERROR(EDOM)); } if (record->zi_start != 0 || record->zi_end != 0) { int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (int level = record->zi_level; level > 0; level--) { record->zi_start >>= shift; record->zi_end >>= shift; } } } dnode_rele(dn, FTAG); return (0); } /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, * which is the switch to trigger all fault injection. */ int zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) { inject_handler_t *handler; int error; spa_t *spa; /* * If this is pool-wide metadata, make sure we unload the corresponding * spa_t, so that the next attempt to load it will trigger the fault. * We call spa_reset() to unload the pool appropriately. */ if (flags & ZINJECT_UNLOAD_SPA) if ((error = spa_reset(name)) != 0) return (error); if (record->zi_cmd == ZINJECT_DELAY_IO) { /* * A value of zero for the number of lanes or for the * delay time doesn't make sense. */ if (record->zi_timer == 0 || record->zi_nlanes == 0) return (SET_ERROR(EINVAL)); /* * The number of lanes is directly mapped to the size of * an array used by the handler. Thus, to ensure the * user doesn't trigger an allocation that's "too large" * we cap the number of lanes here. 
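The lane arithmetic above reduces to: a request completes zi_timer after "now" if its lane is idle, or zi_timer after the lane frees up if it is busy, and the chosen lane then remembers the new completion time. The following standalone sketch simulates a single two-lane, 10 ms handler under that model; the real code defers the lane advance until the minimum across all handlers is known, which is equivalent when only one handler exists, and all names here are hypothetical.

#include <stdint.h>
#include <stdio.h>

typedef int64_t hrtime_sketch_t;	/* nanoseconds, like hrtime_t */

#define	MS	1000000LL

/*
 * Each lane remembers when it becomes idle; a request completes either
 * zi_timer after now (idle lane) or zi_timer after the lane frees up
 * (busy lane), whichever is later.
 */
static hrtime_sketch_t
next_target(hrtime_sketch_t *lanes, int nlanes, int *next_lane,
    hrtime_sketch_t zi_timer, hrtime_sketch_t now)
{
	hrtime_sketch_t idle = now + zi_timer;
	hrtime_sketch_t busy = lanes[*next_lane] + zi_timer;
	hrtime_sketch_t target = (idle > busy) ? idle : busy;

	lanes[*next_lane] = target;			/* claim the lane */
	*next_lane = (*next_lane + 1) % nlanes;		/* round-robin to the next lane */
	return (target);
}

int
main(void)
{
	hrtime_sketch_t lanes[2] = { 0, 0 };	/* 2 lanes, both idle */
	int next_lane = 0;
	hrtime_sketch_t now = 0;

	/* Five back-to-back requests: completions come in pairs, 10 ms apart. */
	for (int i = 0; i < 5; i++)
		printf("request %d completes at %lld ms\n", i, (long long)
		    (next_target(lanes, 2, &next_lane, 10 * MS, now) / MS));
	return (0);
}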
*/ if (record->zi_nlanes >= UINT16_MAX) return (SET_ERROR(EINVAL)); } /* * If the supplied range was in bytes -- calculate the actual blkid */ if (flags & ZINJECT_CALC_RANGE) { error = zio_calculate_range(name, record); if (error != 0) return (error); } if (!(flags & ZINJECT_NULL)) { /* * spa_inject_ref() will add an injection reference, which will * prevent the pool from being removed from the namespace while * still allowing it to be unloaded. */ if ((spa = spa_inject_addref(name)) == NULL) return (SET_ERROR(ENOENT)); handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); handler->zi_spa = spa; handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { handler->zi_lanes = kmem_zalloc( sizeof (*handler->zi_lanes) * handler->zi_record.zi_nlanes, KM_SLEEP); handler->zi_next_lane = 0; } else { handler->zi_lanes = NULL; handler->zi_next_lane = 0; } rw_enter(&inject_lock, RW_WRITER); /* * We can't move this increment into the conditional * above because we need to hold the RW_WRITER lock of * inject_lock, and we don't want to hold that while * allocating the handler's zi_lanes array. */ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { ASSERT3S(inject_delay_count, >=, 0); inject_delay_count++; ASSERT3S(inject_delay_count, >, 0); } *id = handler->zi_id = inject_next_id++; list_insert_tail(&inject_handlers, handler); atomic_inc_32(&zio_injection_enabled); rw_exit(&inject_lock); } /* * Flush the ARC, so that any attempts to read this data will end up * going to the ZIO layer. Note that this is a little overkill, but * we don't have the necessary ARC interfaces to do anything else, and * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) /* * We must use FALSE to ensure arc_flush returns, since * we're not preventing concurrent ARC insertions. */ arc_flush(NULL, FALSE); return (0); } /* * Returns the next record with an ID greater than that supplied to the * function. Used to iterate over all handlers in the system. */ int zio_inject_list_next(int *id, char *name, size_t buflen, zinject_record_t *record) { inject_handler_t *handler; int ret; mutex_enter(&spa_namespace_lock); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id > *id) break; if (handler) { *record = handler->zi_record; *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); + (void) strlcpy(name, spa_name(handler->zi_spa), buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); } rw_exit(&inject_lock); mutex_exit(&spa_namespace_lock); return (ret); } /* * Clear the fault handler with the given identifier, or return ENOENT if none * exists. 
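zio_inject_list_next() is a cursor-style iterator: the caller passes in the last id it saw and gets back the first handler with a larger id, until ENOENT signals the end. A user-space sketch of the same pattern over a hypothetical, already-ordered handler array:

#include <stdio.h>

/* Hypothetical stand-ins for registered handlers and their ids. */
typedef struct {
	int zi_id;
	const char *pool;
} handler_sketch_t;

static handler_sketch_t handlers_sketch[] = {
	{ 1, "tank" }, { 3, "dozer" }, { 7, "tank" },
};

/*
 * Model of the iterator: return the first handler whose id is greater
 * than *id and advance the cursor, or report end-of-list (nonzero).
 */
static int
list_next_sketch(int *id, const handler_sketch_t **out)
{
	for (unsigned i = 0;
	    i < sizeof (handlers_sketch) / sizeof (handlers_sketch[0]); i++) {
		if (handlers_sketch[i].zi_id > *id) {
			*out = &handlers_sketch[i];
			*id = handlers_sketch[i].zi_id;
			return (0);
		}
	}
	return (-1);	/* ENOENT equivalent: iteration complete */
}

int
main(void)
{
	const handler_sketch_t *h;
	int id = 0;	/* cursor: start below the smallest valid id (ids start at 1) */

	while (list_next_sketch(&id, &h) == 0)
		printf("handler %d on pool %s\n", h->zi_id, h->pool);
	return (0);
}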
*/ int zio_clear_fault(int id) { inject_handler_t *handler; rw_enter(&inject_lock, RW_WRITER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id == id) break; if (handler == NULL) { rw_exit(&inject_lock); return (SET_ERROR(ENOENT)); } if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { ASSERT3S(inject_delay_count, >, 0); inject_delay_count--; ASSERT3S(inject_delay_count, >=, 0); } list_remove(&inject_handlers, handler); rw_exit(&inject_lock); if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { ASSERT3P(handler->zi_lanes, !=, NULL); kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * handler->zi_record.zi_nlanes); } else { ASSERT3P(handler->zi_lanes, ==, NULL); } spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); return (0); } void zio_inject_init(void) { rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&inject_handlers, sizeof (inject_handler_t), offsetof(inject_handler_t, zi_link)); } void zio_inject_fini(void) { list_destroy(&inject_handlers); mutex_destroy(&inject_delay_mtx); rw_destroy(&inject_lock); } #if defined(_KERNEL) EXPORT_SYMBOL(zio_injection_enabled); EXPORT_SYMBOL(zio_inject_fault); EXPORT_SYMBOL(zio_inject_list_next); EXPORT_SYMBOL(zio_clear_fault); EXPORT_SYMBOL(zio_handle_fault_injection); EXPORT_SYMBOL(zio_handle_device_injection); EXPORT_SYMBOL(zio_handle_label_injection); #endif diff --git a/tests/zfs-tests/cmd/draid.c b/tests/zfs-tests/cmd/draid.c index 39b58a709cec..76fdb4e8417f 100644 --- a/tests/zfs-tests/cmd/draid.c +++ b/tests/zfs-tests/cmd/draid.c @@ -1,1407 +1,1407 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include /* * The number of rows to generate for new permutation maps. */ #define MAP_ROWS_DEFAULT 256 /* * Key values for dRAID maps when stored as nvlists. 
*/ #define MAP_SEED "seed" #define MAP_CHECKSUM "checksum" #define MAP_WORST_RATIO "worst_ratio" #define MAP_AVG_RATIO "avg_ratio" #define MAP_CHILDREN "children" #define MAP_NPERMS "nperms" #define MAP_PERMS "perms" static void draid_usage(void) { (void) fprintf(stderr, "usage: draid command args ...\n" "Available commands are:\n" "\n" "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" "\tdraid verify [-rv] FILE\n" "\tdraid dump [-v] [-m min] [-n max] FILE\n" "\tdraid table FILE\n" "\tdraid merge FILE SRC SRC...\n"); exit(1); } static int read_map(const char *filename, nvlist_t **allcfgs) { int block_size = 131072; int buf_size = 131072; int tmp_size, error; char *tmp_buf; struct stat64 stat; if (lstat64(filename, &stat) != 0) return (errno); if (stat.st_size == 0 || !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { return (EINVAL); } gzFile fp = gzopen(filename, "rb"); if (fp == Z_NULL) return (errno); char *buf = malloc(buf_size); if (buf == NULL) { (void) gzclose(fp); return (ENOMEM); } ssize_t rc, bytes = 0; while (!gzeof(fp)) { rc = gzread(fp, buf + bytes, block_size); if ((rc < 0) || (rc == 0 && !gzeof(fp))) { free(buf); (void) gzclose(fp); (void) gzerror(fp, &error); return (error); } else { bytes += rc; if (bytes + block_size >= buf_size) { tmp_size = 2 * buf_size; tmp_buf = malloc(tmp_size); if (tmp_buf == NULL) { free(buf); (void) gzclose(fp); return (ENOMEM); } memcpy(tmp_buf, buf, bytes); free(buf); buf = tmp_buf; buf_size = tmp_size; } } } (void) gzclose(fp); error = nvlist_unpack(buf, bytes, allcfgs, 0); free(buf); return (error); } /* * Read a map from the specified filename. A file contains multiple maps * which are indexed by the number of children. The caller is responsible * for freeing the configuration returned. */ static int read_map_key(const char *filename, const char *key, nvlist_t **cfg) { nvlist_t *allcfgs, *foundcfg = NULL; int error; error = read_map(filename, &allcfgs); if (error != 0) return (error); (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg); if (foundcfg != NULL) { nvlist_dup(foundcfg, cfg, KM_SLEEP); error = 0; } else { error = ENOENT; } nvlist_free(allcfgs); return (error); } /* * Write all mappings to the map file. */ static int write_map(const char *filename, nvlist_t *allcfgs) { size_t buflen = 0; int error; error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); if (error) return (error); char *buf = malloc(buflen); if (buf == NULL) return (ENOMEM); error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error) { free(buf); return (error); } /* * Atomically update the file using a temporary file and the * traditional unlink then rename steps. This code provides * no locking, it only guarantees the packed nvlist on disk * is updated atomically and is internally consistent. 
*/ char *tmpname = calloc(1, MAXPATHLEN); if (tmpname == NULL) { free(buf); return (ENOMEM); } snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); int fd = mkstemp(tmpname); if (fd < 0) { error = errno; free(buf); free(tmpname); return (error); } (void) close(fd); gzFile fp = gzopen(tmpname, "w9b"); if (fp == Z_NULL) { error = errno; free(buf); free(tmpname); return (errno); } ssize_t rc, bytes = 0; while (bytes < buflen) { size_t size = MIN(buflen - bytes, 131072); rc = gzwrite(fp, buf + bytes, size); if (rc < 0) { free(buf); (void) gzerror(fp, &error); (void) gzclose(fp); (void) unlink(tmpname); free(tmpname); return (error); } else if (rc == 0) { break; } else { bytes += rc; } } free(buf); (void) gzclose(fp); if (bytes != buflen) { (void) unlink(tmpname); free(tmpname); return (EIO); } /* * Unlink the previous config file and replace it with the updated * version. If we're able to unlink the file then directory is * writable by us and the subsequent rename should never fail. */ error = unlink(filename); if (error != 0 && errno != ENOENT) { error = errno; (void) unlink(tmpname); free(tmpname); return (error); } error = rename(tmpname, filename); if (error != 0) { error = errno; (void) unlink(tmpname); free(tmpname); return (error); } free(tmpname); return (0); } /* * Add the dRAID map to the file and write it out. */ static int write_map_key(const char *filename, char *key, draid_map_t *map, double worst_ratio, double avg_ratio) { nvlist_t *nv_cfg, *allcfgs; int error; /* * Add the configuration to an existing or new file. The new * configuration will replace an existing configuration with the * same key if it has a lower ratio and is therefore better. */ error = read_map(filename, &allcfgs); if (error == ENOENT) { allcfgs = fnvlist_alloc(); } else if (error != 0) { return (error); } error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); if (error == 0) { uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, MAP_WORST_RATIO); double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; if (worst_ratio < nv_worst_ratio) { /* Replace old map with the more balanced new map. */ fnvlist_remove(allcfgs, key); } else { /* The old map is preferable, keep it. 
*/ nvlist_free(allcfgs); return (EEXIST); } } nvlist_t *cfg = fnvlist_alloc(); fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, map->dm_children * map->dm_nperms * sizeof (uint8_t)); fnvlist_add_uint64(cfg, MAP_WORST_RATIO, (uint64_t)(worst_ratio * 1000.0)); fnvlist_add_uint64(cfg, MAP_AVG_RATIO, (uint64_t)(avg_ratio * 1000.0)); error = nvlist_add_nvlist(allcfgs, key, cfg); if (error == 0) error = write_map(filename, allcfgs); nvlist_free(cfg); nvlist_free(allcfgs); return (error); } static void dump_map(draid_map_t *map, const char *key, double worst_ratio, double avg_ratio, int verbose) { if (verbose == 0) { return; } else if (verbose == 1) { printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, worst_ratio, avg_ratio); return; } else { printf(" \"%s\":\n" " seed: 0x%016llx\n" " checksum: 0x%016llx\n" " worst_ratio: %2.03f\n" " avg_ratio: %2.03f\n" " children: %llu\n" " nperms: %llu\n", key, (u_longlong_t)map->dm_seed, (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, (u_longlong_t)map->dm_children, (u_longlong_t)map->dm_nperms); if (verbose > 2) { printf(" perms = {\n"); for (int i = 0; i < map->dm_nperms; i++) { printf(" { "); for (int j = 0; j < map->dm_children; j++) { printf("%3d%s ", map->dm_perms[ i * map->dm_children + j], j < map->dm_children - 1 ? "," : ""); } printf(" },\n"); } printf(" }\n"); } else if (verbose == 2) { printf(" draid_perms = \n"); } } } static void dump_map_nv(const char *key, nvlist_t *cfg, int verbose) { draid_map_t map; uint_t c; uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c); dump_map(&map, key, (double)worst_ratio / 1000.0, avg_ratio / 1000.0, verbose); } /* * Print a summary of the mapping. */ static int dump_map_key(const char *filename, const char *key, int verbose) { nvlist_t *cfg; int error; error = read_map_key(filename, key, &cfg); if (error != 0) return (error); dump_map_nv(key, cfg, verbose); return (0); } /* * Allocate a new permutation map for evaluation. */ static int alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, draid_map_t **mapp) { draid_map_t *map; int error; map = malloc(sizeof (draid_map_t)); if (map == NULL) return (ENOMEM); map->dm_children = children; map->dm_nperms = nperms; map->dm_seed = seed; map->dm_checksum = 0; error = vdev_draid_generate_perms(map, &map->dm_perms); if (error) { free(map); return (error); } *mapp = map; return (0); } /* * Allocate the fixed permutation map for N children. 
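The worst and average ratios are persisted as unsigned fixed-point thousandths, written with (uint64_t)(ratio * 1000.0) and read back by dividing by 1000.0, so comparisons are only meaningful to three decimal places. A minimal round-trip sketch, with hypothetical helper names:

#include <stdint.h>
#include <stdio.h>

/* Encode a ratio as thousandths for storage in a uint64 nvlist field. */
static uint64_t
ratio_encode(double ratio)
{
	return ((uint64_t)(ratio * 1000.0));
}

/* Decode a stored uint64 back into a double for display/comparison. */
static double
ratio_decode(uint64_t stored)
{
	return ((double)stored / 1000.0);
}

int
main(void)
{
	double worst_ratio = 1.25;
	uint64_t stored = ratio_encode(worst_ratio);

	/* Truncation to thousandths: values agree to three decimal places. */
	printf("stored=%llu decoded=%.3f\n",
	    (unsigned long long)stored, ratio_decode(stored));
	return (0);
}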
*/ static int alloc_fixed_map(uint64_t children, draid_map_t **mapp) { const draid_map_t *fixed_map; draid_map_t *map; int error; error = vdev_draid_lookup_map(children, &fixed_map); if (error) return (error); map = malloc(sizeof (draid_map_t)); if (map == NULL) return (ENOMEM); memcpy(map, fixed_map, sizeof (draid_map_t)); VERIFY3U(map->dm_checksum, !=, 0); error = vdev_draid_generate_perms(map, &map->dm_perms); if (error) { free(map); return (error); } *mapp = map; return (0); } /* * Free a permutation map. */ static void free_map(draid_map_t *map) { free(map->dm_perms); free(map); } /* * Check if dev is in the provided list of faulted devices. */ static inline boolean_t is_faulted(int *faulted_devs, int nfaulted, int dev) { for (int i = 0; i < nfaulted; i++) if (faulted_devs[i] == dev) return (B_TRUE); return (B_FALSE); } /* * Evaluate how resilvering I/O will be distributed given a list of faulted * vdevs. As a simplification we assume one IO is sufficient to repair each * damaged device in a group. */ static double eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) { uint64_t children = map->dm_children; uint64_t ngroups = 1; uint64_t ndisks = children - nspares; /* * Calculate the minimum number of groups required to fill a slice. */ while (ngroups * (groupwidth) % (children - nspares) != 0) ngroups++; int *ios = calloc(map->dm_children, sizeof (uint64_t)); /* Resilver all rows */ for (int i = 0; i < map->dm_nperms; i++) { uint8_t *row = &map->dm_perms[i * map->dm_children]; /* Resilver all groups with faulted drives */ for (int j = 0; j < ngroups; j++) { uint64_t spareidx = map->dm_children - nspares; boolean_t repair_needed = B_FALSE; /* See if any devices in this group are faulted */ uint64_t groupstart = (j * groupwidth) % ndisks; for (int k = 0; k < groupwidth; k++) { uint64_t groupidx = (groupstart + k) % ndisks; repair_needed = is_faulted(faulted_devs, nfaulted, row[groupidx]); if (repair_needed) break; } if (repair_needed == B_FALSE) continue; /* * This group is degraded. Calculate the number of * reads the non-faulted drives require and the number * of writes to the distributed hot spare for this row. */ for (int k = 0; k < groupwidth; k++) { uint64_t groupidx = (groupstart + k) % ndisks; if (!is_faulted(faulted_devs, nfaulted, row[groupidx])) { ios[row[groupidx]]++; } else if (nspares > 0) { while (is_faulted(faulted_devs, nfaulted, row[spareidx])) { spareidx++; } ASSERT3U(spareidx, <, map->dm_children); ios[row[spareidx]]++; spareidx++; } } } } *min_child_ios = INT_MAX; *max_child_ios = 0; /* * Find the drives with fewest and most required I/O. These values * are used to calculate the imbalance ratio. To avoid returning an * infinite value for permutations which have children that perform * no IO a floor of 1 IO per child is set. This ensures a meaningful * ratio is returned for comparison and it is not an uncommon when * there are a large number of children. */ for (int i = 0; i < map->dm_children; i++) { if (is_faulted(faulted_devs, nfaulted, i)) { ASSERT0(ios[i]); continue; } if (ios[i] == 0) ios[i] = 1; if (ios[i] < *min_child_ios) *min_child_ios = ios[i]; if (ios[i] > *max_child_ios) *max_child_ios = ios[i]; } ASSERT3S(*min_child_ios, !=, INT_MAX); ASSERT3S(*max_child_ios, !=, 0); double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); free(ios); return (ratio); } /* * Evaluate the quality of the permutation mapping by considering possible * device failures. 
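eval_resilver() first determines how many groups of groupwidth must be laid out before the rotation realigns with the ndisks = children - nspares data slots, i.e. the smallest ngroups for which ngroups * groupwidth is divisible by ndisks. A standalone sketch of that step with a worked example (names hypothetical):

#include <stdint.h>
#include <stdio.h>

/*
 * Smallest number of groups such that ngroups * groupwidth is a multiple
 * of the number of data slots; this is how many groups it takes for the
 * group rotation to return to its starting alignment within a slice.
 */
static uint64_t
groups_per_slice(uint64_t groupwidth, uint64_t ndisks)
{
	uint64_t ngroups = 1;

	while ((ngroups * groupwidth) % ndisks != 0)
		ngroups++;
	return (ngroups);
}

int
main(void)
{
	/* e.g. 11 children, 1 distributed spare, 4-wide groups (2d+2p). */
	uint64_t children = 11, nspares = 1, groupwidth = 4;
	uint64_t ndisks = children - nspares;

	printf("%llu groups fill a slice across %llu data slots\n",
	    (unsigned long long)groups_per_slice(groupwidth, ndisks),
	    (unsigned long long)ndisks);	/* prints 5: 5 * 4 = 20 is a multiple of 10 */
	return (0);
}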
Returns the imbalance ratio for the worst mapping which * is defined to be the largest number of child IOs over the fewest number * child IOs. A value of 1.0 indicates the mapping is perfectly balance and * all children perform an equal amount of work during reconstruction. */ static void eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) { uint64_t children = map->dm_children; double worst_ratio = 1.0; double sum = 0; int worst_min_ios = 0, worst_max_ios = 0; int n = 0; /* * When there are only 2 children there can be no distributed * spare and no resilver to evaluate. Default to a ratio of 1.0 * for this degenerate case. */ if (children == VDEV_DRAID_MIN_CHILDREN) { *worst_ratiop = 1.0; *avg_ratiop = 1.0; return; } /* * Score the mapping as if it had either 1 or 2 distributed spares. */ for (int nspares = 1; nspares <= 2; nspares++) { uint64_t faults = nspares; /* * Score groupwidths up to 19. This value was chosen as the * largest reasonable width (16d+3p). dRAID pools may be still * be created with wider stripes but they are not considered in * this analysis in order to optimize for the most common cases. */ for (uint64_t groupwidth = 2; groupwidth <= MIN(children - nspares, 19); groupwidth++) { int faulted_devs[2]; int min_ios, max_ios; /* * Score possible devices faults. This is limited * to exactly one fault per distributed spare for * the purposes of this similation. */ for (int f1 = 0; f1 < children; f1++) { faulted_devs[0] = f1; double ratio; if (faults == 1) { ratio = eval_resilver(map, groupwidth, nspares, faulted_devs, faults, &min_ios, &max_ios); if (ratio > worst_ratio) { worst_ratio = ratio; worst_min_ios = min_ios; worst_max_ios = max_ios; } sum += ratio; n++; } else if (faults == 2) { for (int f2 = f1 + 1; f2 < children; f2++) { faulted_devs[1] = f2; ratio = eval_resilver(map, groupwidth, nspares, faulted_devs, faults, &min_ios, &max_ios); if (ratio > worst_ratio) { worst_ratio = ratio; worst_min_ios = min_ios; worst_max_ios = max_ios; } sum += ratio; n++; } } } } } *worst_ratiop = worst_ratio; *avg_ratiop = sum / n; /* * Log the min/max io values for particularly unbalanced maps. * Since the maps are generated entirely randomly these are possible * be exceedingly unlikely. We log it for possible investigation. */ if (worst_ratio > 100.0) { dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); printf("worst_min_ios=%d worst_max_ios=%d\n", worst_min_ios, worst_max_ios); } } static int eval_maps(uint64_t children, int passes, uint64_t *map_seed, draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) { draid_map_t *best_map = NULL; double best_worst_ratio = 1000.0; double best_avg_ratio = 1000.0; /* * Perform the requested number of passes evaluating randomly * generated permutation maps. Only the best version is kept. */ for (int i = 0; i < passes; i++) { double worst_ratio, avg_ratio; draid_map_t *map; int error; /* * Calculate the next seed and generate a new candidate map. */ error = alloc_new_map(children, MAP_ROWS_DEFAULT, vdev_draid_rand(map_seed), &map); if (error) { if (best_map != NULL) free_map(best_map); return (error); } /* * Consider maps with a lower worst_ratio to be of higher * quality. Some maps may have a lower avg_ratio but they * are discarded since they might include some particularly * imbalanced permutations. The average is tracked to in * order to get a sense of the average permutation quality. 
*/ eval_decluster(map, &worst_ratio, &avg_ratio); if (best_map == NULL || worst_ratio < best_worst_ratio) { if (best_map != NULL) free_map(best_map); best_map = map; best_worst_ratio = worst_ratio; best_avg_ratio = avg_ratio; } else { free_map(map); } } /* * After determining the best map generate a checksum over the full * permutation array. This checksum is verified when opening a dRAID * pool to ensure the generated in memory permutations are correct. */ zio_cksum_t cksum; fletcher_4_native_varsize(best_map->dm_perms, sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, &cksum); best_map->dm_checksum = cksum.zc_word[0]; *best_mapp = best_map; *best_ratiop = best_worst_ratio; *avg_ratiop = best_avg_ratio; return (0); } static int draid_generate(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; uint64_t map_seed; int c, fd, error, verbose = 0, passes = 1, continuous = 0; int min_children = VDEV_DRAID_MIN_CHILDREN; int max_children = VDEV_DRAID_MAX_CHILDREN; int restarts = 0; while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { switch (c) { case 'c': continuous++; break; case 'm': min_children = (int)strtol(optarg, NULL, 0); if (min_children < VDEV_DRAID_MIN_CHILDREN) { (void) fprintf(stderr, "A minimum of 2 " "children are required.\n"); return (1); } break; case 'n': max_children = (int)strtol(optarg, NULL, 0); if (max_children > VDEV_DRAID_MAX_CHILDREN) { (void) fprintf(stderr, "A maximum of %d " "children are allowed.\n", VDEV_DRAID_MAX_CHILDREN); return (1); } break; case 'p': passes = (int)strtol(optarg, NULL, 0); break; case 'v': /* * 0 - Only log when a better map is added to the file. * 1 - Log the current best map for each child count. * Minimal output on a single summary line. * 2 - Log the current best map for each child count. * More verbose includes most map fields. * 3 - Log the current best map for each child count. * Very verbose all fields including the full map. */ verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) - strncpy(filename, argv[optind], MAXPATHLEN - 1); + strlcpy(filename, argv[optind], sizeof (filename)); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } restart: /* * Start with a fresh seed from /dev/urandom. */ fd = open("/dev/urandom", O_RDONLY); if (fd < 0) { printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); return (1); } else { ssize_t bytes = sizeof (map_seed); ssize_t bytes_read = 0; while (bytes_read < bytes) { ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read, bytes - bytes_read); if (rc < 0) { printf("Unable to read /dev/urandom: %s\n:", strerror(errno)); close(fd); return (1); } bytes_read += rc; } (void) close(fd); } if (restarts == 0) printf("Writing generated mappings to '%s':\n", filename); /* * Generate maps for all requested child counts. The best map for * each child count is written out to the specified file. If the file * already contains a better mapping this map will not be added. 
*/ for (uint64_t children = min_children; children <= max_children; children++) { char key[8] = { 0 }; draid_map_t *map; double worst_ratio = 1000.0; double avg_ratio = 1000.0; error = eval_maps(children, passes, &map_seed, &map, &worst_ratio, &avg_ratio); if (error) { printf("Error eval_maps(): %s\n", strerror(error)); return (1); } if (worst_ratio < 1.0 || avg_ratio < 1.0) { printf("Error ratio < 1.0: worst_ratio = %2.03f " "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); return (1); } snprintf(key, 7, "%llu", (u_longlong_t)children); error = write_map_key(filename, key, map, worst_ratio, avg_ratio); if (error == 0) { /* The new map was added to the file. */ dump_map(map, key, worst_ratio, avg_ratio, MAX(verbose, 1)); } else if (error == EEXIST) { /* The existing map was preferable and kept. */ if (verbose > 0) dump_map_key(filename, key, verbose); } else { printf("Error write_map_key(): %s\n", strerror(error)); return (1); } free_map(map); } /* * When the continuous option is set restart at the minimum number of * children instead of exiting. This option is useful as a mechanism * to continuous try and refine the discovered permutations. */ if (continuous) { restarts++; printf("Restarting by request (-c): %d\n", restarts); goto restart; } return (0); } /* * Verify each map in the file by generating its in-memory permutation array * and comfirming its checksum is correct. */ static int draid_verify(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int n = 0, c, error, verbose = 1; int check_ratios = 0; while ((c = getopt(argc, argv, ":rv")) != -1) { switch (c) { case 'r': check_ratios++; break; case 'v': verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) { char *abspath = malloc(MAXPATHLEN); if (abspath == NULL) return (ENOMEM); if (realpath(argv[optind], abspath) != NULL) - strncpy(filename, abspath, MAXPATHLEN - 1); + strlcpy(filename, abspath, sizeof (filename)); else - strncpy(filename, argv[optind], MAXPATHLEN - 1); + strlcpy(filename, argv[optind], sizeof (filename)); free(abspath); } else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } printf("Verifying permutation maps: '%s'\n", filename); /* * Lookup hardcoded permutation map for each valid number of children * and verify a generated map has the correct checksum. Then compare * the generated map values with the nvlist map values read from the * reference file to cross-check the permutation. */ for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; children <= VDEV_DRAID_MAX_CHILDREN; children++) { draid_map_t *map; char key[8] = {0}; snprintf(key, 8, "%llu", (u_longlong_t)children); error = alloc_fixed_map(children, &map); if (error) { printf("Error alloc_fixed_map() failed: %s\n", error == ECKSUM ? "Invalid checksum" : strerror(error)); return (1); } uint64_t nv_seed, nv_checksum, nv_children, nv_nperms; uint8_t *nv_perms; nvlist_t *cfg; uint_t c; error = read_map_key(filename, key, &cfg); if (error != 0) { printf("Error read_map_key() failed: %s\n", strerror(error)); free_map(map); return (1); } nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c); /* * Compare draid_map_t and nvlist reference values. 
*/ if (map->dm_seed != nv_seed) { printf("Error different seeds: 0x%016llx != " "0x%016llx\n", (u_longlong_t)map->dm_seed, (u_longlong_t)nv_seed); error = EINVAL; } if (map->dm_checksum != nv_checksum) { printf("Error different checksums: 0x%016llx " "!= 0x%016llx\n", (u_longlong_t)map->dm_checksum, (u_longlong_t)nv_checksum); error = EINVAL; } if (map->dm_children != nv_children) { printf("Error different children: %llu " "!= %llu\n", (u_longlong_t)map->dm_children, (u_longlong_t)nv_children); error = EINVAL; } if (map->dm_nperms != nv_nperms) { printf("Error different nperms: %llu " "!= %llu\n", (u_longlong_t)map->dm_nperms, (u_longlong_t)nv_nperms); error = EINVAL; } for (uint64_t i = 0; i < nv_children * nv_nperms; i++) { if (map->dm_perms[i] != nv_perms[i]) { printf("Error different perms[%llu]: " "%d != %d\n", (u_longlong_t)i, (int)map->dm_perms[i], (int)nv_perms[i]); error = EINVAL; break; } } /* * For good measure recalculate the worst and average * ratios and confirm they match the nvlist values. */ if (check_ratios) { uint64_t nv_worst_ratio, nv_avg_ratio; double worst_ratio, avg_ratio; eval_decluster(map, &worst_ratio, &avg_ratio); nv_worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); nv_avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); if (worst_ratio < 1.0 || avg_ratio < 1.0) { printf("Error ratio out of range %2.03f, " "%2.03f\n", worst_ratio, avg_ratio); error = EINVAL; } if ((uint64_t)(worst_ratio * 1000.0) != nv_worst_ratio) { printf("Error different worst_ratio %2.03f " "!= %2.03f\n", (double)nv_worst_ratio / 1000.0, worst_ratio); error = EINVAL; } if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) { printf("Error different average_ratio %2.03f " "!= %2.03f\n", (double)nv_avg_ratio / 1000.0, avg_ratio); error = EINVAL; } } if (error) { free_map(map); nvlist_free(cfg); return (1); } if (verbose > 0) { printf("- %llu children: good\n", (u_longlong_t)children); } n++; free_map(map); nvlist_free(cfg); } if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) { printf("Error permutation maps missing: %d / %d checked\n", n, VDEV_DRAID_MAX_CHILDREN - 1); return (1); } printf("Successfully verified %d / %d permutation maps\n", n, VDEV_DRAID_MAX_CHILDREN - 1); return (0); } /* * Dump the contents of the specified mapping(s) for inspection. */ static int draid_dump(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int c, error, verbose = 1; int min_children = VDEV_DRAID_MIN_CHILDREN; int max_children = VDEV_DRAID_MAX_CHILDREN; while ((c = getopt(argc, argv, ":vm:n:")) != -1) { switch (c) { case 'm': min_children = (int)strtol(optarg, NULL, 0); if (min_children < 2) { (void) fprintf(stderr, "A minimum of 2 " "children are required.\n"); return (1); } break; case 'n': max_children = (int)strtol(optarg, NULL, 0); if (max_children > VDEV_DRAID_MAX_CHILDREN) { (void) fprintf(stderr, "A maximum of %d " "children are allowed.\n", VDEV_DRAID_MAX_CHILDREN); return (1); } break; case 'v': verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) - strncpy(filename, argv[optind], MAXPATHLEN - 1); + strlcpy(filename, argv[optind], sizeof (filename)); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } /* * Dump maps for the requested child counts. 
*/ for (uint64_t children = min_children; children <= max_children; children++) { char key[8] = { 0 }; snprintf(key, 7, "%llu", (u_longlong_t)children); error = dump_map_key(filename, key, verbose); if (error) { printf("Error dump_map_key(): %s\n", strerror(error)); return (1); } } return (0); } /* * Print all of the mappings as a C formatted draid_map_t array. This table * is found in the module/zcommon/zfs_draid.c file and is the definitive * source for all mapping used by dRAID. It cannot be updated without * changing the dRAID on disk format. */ static int draid_table(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int error; if (argc > optind) - strncpy(filename, argv[optind], MAXPATHLEN - 1); + strlcpy(filename, argv[optind], sizeof (filename)); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } printf("static const draid_map_t " "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n"); for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; children <= VDEV_DRAID_MAX_CHILDREN; children++) { uint64_t seed, checksum, nperms, avg_ratio; nvlist_t *cfg; char key[8] = {0}; snprintf(key, 8, "%llu", (u_longlong_t)children); error = read_map_key(filename, key, &cfg); if (error != 0) { printf("Error read_map_key() failed: %s\n", strerror(error)); return (1); } seed = fnvlist_lookup_uint64(cfg, MAP_SEED); checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t" "/* %2.03f */\n", (u_longlong_t)children, (u_longlong_t)nperms, (u_longlong_t)seed, (u_longlong_t)checksum, (double)avg_ratio / 1000.0); nvlist_free(cfg); } printf("};\n"); return (0); } static int draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp) { nvlist_t *srccfgs; nvpair_t *elem = NULL; int error, merged = 0; error = read_map(srcfilename, &srccfgs); if (error != 0) return (error); while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) { uint64_t nv_worst_ratio; uint64_t allcfg_worst_ratio; nvlist_t *cfg, *allcfg; char *key; switch (nvpair_type(elem)) { case DATA_TYPE_NVLIST: (void) nvpair_value_nvlist(elem, &cfg); key = nvpair_name(elem); nv_worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); error = nvlist_lookup_nvlist(allcfgs, key, &allcfg); if (error == 0) { allcfg_worst_ratio = fnvlist_lookup_uint64( allcfg, MAP_WORST_RATIO); if (nv_worst_ratio < allcfg_worst_ratio) { fnvlist_remove(allcfgs, key); error = nvlist_add_nvlist(allcfgs, key, cfg); merged++; } } else if (error == ENOENT) { error = nvlist_add_nvlist(allcfgs, key, cfg); merged++; } else { return (error); } break; default: continue; } } nvlist_free(srccfgs); *mergedp = merged; return (0); } /* * Merge the best map for each child count found in the listed files into * a new file. This allows 'draid generate' to be run in parallel and for * the results maps to be combined. 
*/ static int draid_merge(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int c, error, total_merged = 0; nvlist_t *allcfgs; while ((c = getopt(argc, argv, ":")) != -1) { switch (c) { case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc < 4) { (void) fprintf(stderr, "A FILE and multiple SRCs must be specified.\n"); return (1); } - strncpy(filename, argv[optind], MAXPATHLEN - 1); + strlcpy(filename, argv[optind], sizeof (filename)); optind++; error = read_map(filename, &allcfgs); if (error == ENOENT) { allcfgs = fnvlist_alloc(); } else if (error != 0) { printf("Error read_map(): %s\n", strerror(error)); return (error); } while (optind < argc) { char srcfilename[MAXPATHLEN] = {0}; int merged = 0; - strncpy(srcfilename, argv[optind], MAXPATHLEN - 1); + strlcpy(srcfilename, argv[optind], sizeof (srcfilename)); error = draid_merge_impl(allcfgs, srcfilename, &merged); if (error) { printf("Error draid_merge_impl(): %s\n", strerror(error)); nvlist_free(allcfgs); return (1); } total_merged += merged; printf("Merged %d key(s) from '%s' into '%s'\n", merged, srcfilename, filename); optind++; } if (total_merged > 0) write_map(filename, allcfgs); printf("Merged a total of %d key(s) into '%s'\n", total_merged, filename); nvlist_free(allcfgs); return (0); } int main(int argc, char *argv[]) { if (argc < 2) draid_usage(); char *subcommand = argv[1]; if (strcmp(subcommand, "generate") == 0) { return (draid_generate(argc - 1, argv + 1)); } else if (strcmp(subcommand, "verify") == 0) { return (draid_verify(argc - 1, argv + 1)); } else if (strcmp(subcommand, "dump") == 0) { return (draid_dump(argc - 1, argv + 1)); } else if (strcmp(subcommand, "table") == 0) { return (draid_table(argc - 1, argv + 1)); } else if (strcmp(subcommand, "merge") == 0) { return (draid_merge(argc - 1, argv + 1)); } else { draid_usage(); } } diff --git a/tests/zfs-tests/cmd/zfs_diff-socket.c b/tests/zfs-tests/cmd/zfs_diff-socket.c index be4bf31dde9f..3ebc95799fe3 100644 --- a/tests/zfs-tests/cmd/zfs_diff-socket.c +++ b/tests/zfs-tests/cmd/zfs_diff-socket.c @@ -1,57 +1,56 @@ /* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2017, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include int main(int argc, char *argv[]) { struct sockaddr_un sock; int fd; char *path; size_t size; if (argc != 2) { fprintf(stderr, "usage: %s /path/to/socket\n", argv[0]); exit(1); } path = argv[1]; size = sizeof (sock.sun_path); - strncpy(sock.sun_path, (char *)path, size - 1); - sock.sun_path[size - 1] = '\0'; + (void) snprintf(sock.sun_path, size, "%s", path); sock.sun_family = AF_UNIX; if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) == -1) { perror("socket"); return (1); } if (bind(fd, (struct sockaddr *)&sock, sizeof (struct sockaddr_un))) { perror("bind"); return (1); } if (close(fd)) { perror("close"); return (1); } return (0); }