diff --git a/sys/contrib/openzfs/.cirrus.yml b/sys/contrib/openzfs/.cirrus.yml new file mode 100644 index 000000000000..18b292289e20 --- /dev/null +++ b/sys/contrib/openzfs/.cirrus.yml @@ -0,0 +1,21 @@ +env: + CIRRUS_CLONE_DEPTH: 1 + ARCH: amd64 + +build_task: + matrix: + freebsd_instance: + image_family: freebsd-12-4 + freebsd_instance: + image_family: freebsd-13-2 + freebsd_instance: + image_family: freebsd-14-0-snap + prepare_script: + - pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py39-packaging py39-cffi py39-sysctl + configure_script: + - env MAKE=gmake ./autogen.sh + - env MAKE=gmake ./configure --with-config="user" --with-python=3.9 + build_script: + - gmake -j `sysctl -n kern.smp.cpus` + install_script: + - gmake install diff --git a/sys/contrib/openzfs/.gitignore b/sys/contrib/openzfs/.gitignore index 8d91dd9466c5..1ef47d921c28 100644 --- a/sys/contrib/openzfs/.gitignore +++ b/sys/contrib/openzfs/.gitignore @@ -1,88 +1,88 @@ # # This is the top-level .gitignore file: # ignore everything except a list of allowed files. # # This is not the place for entries that are specific to # a subdirectory. Instead add those files to the # .gitignore file in that subdirectory. # # N.B. # Please use 'git ls-files -i --exclude-standard' # command after changing this file, to see if there are # any tracked files which get ignored after the change. * !.github !cmd !config !contrib !etc !include !lib !man !module !rpm !scripts !tests !udev !.github/** !cmd/** !config/** !contrib/** !etc/** !include/** !lib/** !man/** !module/** !rpm/** !scripts/** !tests/** !udev/** !.editorconfig +!.cirrus.yml !.gitignore !.gitmodules !AUTHORS !autogen.sh !CODE_OF_CONDUCT.md !configure.ac !copy-builtin !COPYRIGHT !LICENSE !Makefile.am !META !NEWS !NOTICE !README.md !RELEASES.md !TEST !zfs.release.in - # # Normal rules # *.[oa] *.o.ur-safe *.lo *.la *.mod.c *~ *.swp *.gcno *.gcda *.pyc *.pyo .deps .libs .dirstamp .DS_Store modules.order Makefile Makefile.in *.patch *.orig *.tmp *.log diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index d64fdfa5ba4c..5507f9d3fd67 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -1,11222 +1,11237 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley * Copyright (c) 2021, Klara Inc. * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" libzfs_handle_t *g_zfs; static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN, HELP_VERSION, HELP_WAIT } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zpool_command_t command_table[] = { { "version", zpool_do_version, HELP_VERSION }, { NULL }, { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" static zpool_command_t *current_command; static zfs_type_t current_prop_type = (ZFS_TYPE_POOL | ZFS_TYPE_VDEV); static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-f] [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-c | -s | -u] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] [-e] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n" "\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); default: __builtin_unreachable(); } } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && strcmp(path, VDEV_TYPE_HOLE) != 0) fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * Callback routine that will print out a pool property value. */ static int print_pool_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Callback routine that will print out a vdev property value. */ static int print_vdev_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", vdev_prop_to_name(prop)); if (vdev_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (vdev_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", vdev_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static __attribute__((noreturn)) void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nFor further help on a command or topic, " "run: %s\n"), "zpool help []"); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && current_prop_type != (ZFS_TYPE_POOL | ZFS_TYPE_VDEV) && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, "%s", gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ if (current_prop_type == ZFS_TYPE_POOL) { (void) zprop_iter(print_pool_prop_cb, fp, B_FALSE, B_TRUE, current_prop_type); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES " "disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties " "must be appended with a feature name.\n" "See zpool-features(7).\n")); } else if (current_prop_type == ZFS_TYPE_VDEV) { (void) zprop_iter(print_vdev_prop_cb, fp, B_FALSE, B_TRUE, current_prop_type); } } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * zpool initialize [-c | -s | -u] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -u Uninitialize. Clears initialization state. * -w Wait. Blocks until initializing has completed. */ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"uninit", no_argument, NULL, 'u'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; while ((c = getopt_long(argc, argv, "csuw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case 'u': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_UNINIT) { (void) fprintf(stderr, gettext("-u cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_UNINIT; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_INITIALIZE_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c, -s" "or -u\n")); usage(B_FALSE); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } if (wait) err = zpool_initialize_wait(zhp, cmd_type, vdevs); else err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; const char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole == B_TRUE) { continue; } (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } /* * Print the list of l2cache devices for dry runs. */ static void print_cache_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "cache"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } /* * Print the list of spares for dry runs. */ static void print_spare_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "spares"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, const char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; const char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); const char *cname = zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && (!zpool_prop_feature(propname) && !zpool_prop_vdev(propname))) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool or vdev property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } /* * if version is specified, only "legacy" compatibility * may be requested */ if ((prop == ZPOOL_PROP_COMPATIBILITY && strcmp(propval, ZPOOL_COMPAT_LEGACY) != 0 && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && nvlist_exists(proplist, cname) && strcmp(fnvlist_lookup_string(proplist, cname), ZPOOL_COMPAT_LEGACY) != 0)) { (void) fprintf(stderr, gettext("when 'version' is " "specified, the 'compatibility' feature may only " "be set to '" ZPOOL_COMPAT_LEGACY "'\n")); return (2); } if (zpool_prop_feature(propname) || zpool_prop_vdev(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, const char *propval, nvlist_t **props) { const char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. */ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child, **sparechild; uint_t l2children, sparechildren, c; char *vname; boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } /* And finally the spares */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { hadspare = B_TRUE; (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { if (!hadspare) (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove [-npsw] ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; boolean_t wait = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { zpool_close(zhp); (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; if (wait) { (void) fprintf(stderr, gettext("invalid option " "combination: -w cannot be used with -s\n")); usage(B_FALSE); } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } if (ret == 0 && wait) ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } /* * Return 1 if a vdev is active (being used in a pool) * Return 0 if a vdev is inactive (offlined or faulted, or not in active pool) * * This is useful for checking if a disk in an active pool is offlined or * faulted. */ static int vdev_is_active(char *vdev_path) { int fd; fd = open(vdev_path, O_EXCL); if (fd < 0) { return (1); /* cant open O_EXCL - disk is active */ } close(fd); return (0); /* disk is inactive in the pool */ } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } (void) strlcpy(vdev, argv[0], sizeof (vdev)); /* * If we cannot open an absolute path, we quit. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ if ((fd = open(vdev, O_RDWR)) < 0) { int error; if (vdev[0] == '/') { (void) fprintf(stderr, gettext("failed to open " "%s: %s\n"), vdev, strerror(errno)); return (1); } error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || ((fd = open(vdev, O_RDWR)) < 0)) { if (errno == ENOENT) { (void) fprintf(stderr, gettext( "failed to find device %s, try " "specifying absolute path instead\n"), argv[0]); return (1); } (void) fprintf(stderr, gettext("failed to open %s:" " %s\n"), vdev, strerror(errno)); return (1); } } /* * Flush all dirty pages for the block device. This should not be * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. */ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: /* * We allow the user to call 'zpool offline -f' * on an offlined disk in an active pool. We can check if * the disk is online by calling vdev_is_active(). */ if (force && !vdev_is_active(vdev)) break; (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\""), vdev, zpool_pool_state_to_name(state), name); if (force) { (void) fprintf(stderr, gettext( ". Offline the disk first to clear its label.")); } printf("\n"); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. * Once we get the nvlist back from make_root_vdev(), we either print out the * contents (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_pool_features = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *compat = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_pool_features = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; if (zpool_name_to_prop(optarg) == ZPOOL_PROP_COMPATIBILITY) compat = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. */ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); print_cache_list(nvroot, 0); print_spare_list(nvroot, 0); ret = 0; } else { /* * Load in feature set. * Note: if compatibility property not given, we'll have * NULL, which means 'all features'. */ boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) goto errout; /* * props contains list of features to enable. * For each feature: * - remove it if feature@name=disabled * - leave it there if feature@name=enabled * - add it if: * - enable_pool_features (ie: no '-d' or '-o version') * - it's supported by the kernel module * - it's in the requested feature set * - warn if it's enabled but not in compat */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; const char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) { (void) nvlist_remove_all(props, propname); } else if (strcmp(propval, ZFS_FEATURE_ENABLED) == 0 && !requested_features[i]) { (void) fprintf(stderr, gettext( "Warning: feature \"%s\" enabled " "but is not in specified " "'compatibility' feature set.\n"), feat->fi_uname); } } else if ( enable_pool_features && feat->fi_zfs_mod_supported && requested_features[i]) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_share(pool, NULL); zfs_commit_shares(NULL); } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. */ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { static const char *const subtypes[] = {ZPOOL_CONFIG_SPARES, ZPOOL_CONFIG_L2CACHE, ZPOOL_CONFIG_CHILDREN}; char *name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); max = MAX(strlen(name) + depth, max); free(name); nvlist_t **child; uint_t children; for (size_t i = 0; i < ARRAY_SIZE(subtypes); ++i) if (nvlist_lookup_nvlist_array(nv, subtypes[i], &child, &children) == 0) for (uint_t c = 0; c < children; ++c) max = MAX(max_width(zhp, child[c], depth + 2, max, name_flags), max); return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ static boolean_t is_blank_str(const char *str) { for (; str != NULL && *str != '\0'; ++str) if (!isblank(*str)) return (B_FALSE); return (B_TRUE); } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, const char *path) { vdev_cmd_data_t *data; int i, j; const char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (val == NULL || is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) fputs(" ", stdout); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. */ if (vcdl->uniq_cols_cnt > 0) fputs(" ", stdout); val = data->lines[j]; fputs(val ?: "", stdout); } break; } } /* * Print vdev initialization status for leaves */ static void print_status_initialize(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } else { (void) printf(gettext(" (uninitialized)")); } } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) printf(gettext(" (initializing)")); } } /* * Print vdev TRIM status for leaves */ static void print_status_trim(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || vs->vs_trim_state == VDEV_TRIM_SUSPENDED || vs->vs_trim_state == VDEV_TRIM_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { trim_pct = (vs->vs_trim_bytes_done * 100 / (vs->vs_trim_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_TRIM_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_TRIM_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% trimmed%s)"), trim_pct, zbuf); } else if (vs->vs_trim_notsup) { (void) printf(gettext(" (trim unsupported)")); } else { (void) printf(gettext(" (untrimmed)")); } } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { (void) printf(gettext(" (trimming)")); } } /* * Return the color associated with a health string. This includes returning * NULL for no color change. */ static const char * health_str_to_color(const char *health) { if (strcmp(health, gettext("FAULTED")) == 0 || strcmp(health, gettext("SUSPENDED")) == 0 || strcmp(health, gettext("UNAVAIL")) == 0) { return (ANSI_RED); } if (strcmp(health, gettext("OFFLINE")) == 0 || strcmp(health, gettext("DEGRADED")) == 0 || strcmp(health, gettext("REMOVED")) == 0) { return (ANSI_YELLOW); } return (NULL); } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; const char *type; const char *path = NULL; const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) state = gettext("AVAIL"); } printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { if (vs->vs_read_errors) rcolor = ANSI_RED; if (vs->vs_write_errors) wcolor = ANSI_RED; if (vs->vs_checksum_errors) ccolor = ANSI_RED; if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", (u_longlong_t)vs->vs_read_errors); fputc(' ', stdout); printf_color(wcolor, "%5llu", (u_longlong_t)vs->vs_write_errors); fputc(' ', stdout); printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); fputc(' ', stdout); printf_color(rcolor, "%5s", rbuf); fputc(' ', stdout); printf_color(wcolor, "%5s", wbuf); fputc(' ', stdout); printf_color(ccolor, "%5s", cbuf); } if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ zfs_nicenum(vs->vs_slow_ios, rbuf, sizeof (rbuf)); } else { snprintf(rbuf, sizeof (rbuf), "-"); } if (cb->cb_literal) printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); else printf(" %5s", rbuf); } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ASHIFT_TOO_BIG: (void) printf(gettext("unsupported minimum blocksize")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } color_end(); } else if (children == 0 && !isspare && getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift) { (void) printf( gettext(" block size: %dB configured, %dB native"), 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } if (vs->vs_scan_removing != 0) { (void) printf(gettext(" (removing)")); } else if (VDEV_STAT_VALID(vs_noalloc, vsc) && vs->vs_noalloc != 0) { (void) printf(gettext(" (non-allocating)")); } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); /* * If you force fault a drive that's resilvering, its scan stats can * get frozen in time, giving the false impression that it's * being resilvered. That's why we check the state to see if the vdev * is healthy before reporting "resilvering" or "repairing". */ if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } /* Display vdev initialization and trim status for leaves. */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; /* Provide vdev_rebuild_stats to children if available */ if (vrs == NULL) { (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare, vrs); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; const char *type; char *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. */ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; const char *bias = NULL; const char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = (char *)VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ static int show_import(nvlist_t *config, boolean_t report_error) { uint64_t pool_state; vdev_stat_t *vs; const char *name; uint64_t guid; uint64_t hostid = 0; const char *msgid; const char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; const char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); /* * If we're importing using a cachefile, then we won't report any * errors unless we are in the scan phase of the import. */ if (reason != ZPOOL_STATUS_OK && !report_error) return (reason); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices contains" " corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported " "features are not enabled on the pool.\n\t" "(Note that they may be intentionally disabled " "if the\n\t'compatibility' property is set.)\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" "property.\n")); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n" "requested by the 'compatibility' property.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric\n\tidentifier, " "though the file(s) indicated by its " "'compatibility'\n\tproperty cannot be parsed at " "this time.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. " "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted datasets contain an on-disk " "incompatibility, which\n\tneeds to be " "corrected. Backup these datasets to new " "encrypted datasets\n\tand destroy the " "old ones.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted snapshots and bookmarks contain " "an on-disk\n\tincompatibility. This may " "cause on-disk corruption if they are used" "\n\twith 'zfs recv'. To correct the " "issue, enable the bookmark_v2 feature.\n\t" "No additional action is needed if there " "are no encrypted snapshots or\n\t" "bookmarks. If preserving the encrypted " "snapshots and bookmarks is\n\trequired, " "use a non-raw send to backup and restore " "them. Alternately,\n\tthey may be removed" " to resolve the incompatibility.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext(" action: The pool must be " "exported from %s (hostid=%"PRIx64")\n\tbefore it " "can be safely imported.\n"), hostname, hostid); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. */ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) { (void) printf(gettext( " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } (void) printf(gettext(" config:\n\n")); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } return (0); } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + + /* + * The hostid on LOAD_INFO comes from the MOS label via + * spa_tryimport(). If its not there then we're likely talking to an + * older kernel, so use the top one, which will be from the label + * discovered in zpool_find_import(), or if a cachefile is in use, the + * local hostid. + */ + if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0) + nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; int ms_status = 0; zpool_handle_t *zhp; const char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { const char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%"PRIx64")\nExport the pool on the other " "system, then run 'zpool import'.\n"), name, hostname, hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { const char *hostname = ""; time_t timestamp = 0; uint64_t hostid = 0; - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_HOSTNAME); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_HOSTID); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%"PRIx64") at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, hostid, ctime(×tamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller. */ if (flags & ZFS_IMPORT_LOAD_KEYS && zfs_crypto_attempt_load_keys(g_zfs, name) != 0) ret = 1; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY)) { ms_status = zpool_enable_datasets(zhp, mntopts, 0); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to share some datasets")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to mount some datasets")); } } zpool_close(zhp); return (ret); } static int import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, char *orig_name, char *new_name, boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, importargs_t *import) { nvlist_t *config = NULL; nvlist_t *found_config = NULL; uint64_t pool_state; /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ int err = 0; nvpair_t *elem = NULL; boolean_t first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, import->policy) == 0); if (!pool_specified) { if (first) first = B_FALSE; else if (!do_all) (void) fputc('\n', stdout); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { /* * If we're importing from cachefile, then * we don't want to report errors until we * are in the scan phase of the import. If * we get an error, then we return that error * to invoke the scan phase. */ if (import->cachefile && !import->scan) err = show_import(config, B_FALSE); else (void) show_import(config, B_TRUE); } } else if (import->poolname != NULL) { const char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, import->poolname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), import->poolname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == import->guid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (pool_specified && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), orig_name); err = B_TRUE; } else { err |= do_import(found_config, new_name, mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found. */ if (!pool_specified && first) (void) fprintf(stderr, gettext("no pools available to import\n")); return (err); } typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { const char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint * checkpoint --discard * * -d Discard the checkpoint from a checkpointed * --discard pool. * * -w Wait for discarding a checkpoint to complete. * --wait * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; wait = B_FALSE; while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (wait && !discard) { (void) fprintf(stderr, gettext("--wait only valid when " "--discard also specified\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); if (err == 0 && wait) err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); } else { err = (zpool_checkpoint(zhp) != 0); } zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] [-n] [-F] * [newpool] * * -c Read pool information from a cachefile instead of searching * devices. If importing from a cachefile config fails, then * fallback to searching for devices only in the directories that * exist in the cachefile. * * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets. * * -T Specify a starting txg to use for import. This option is * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -l Load encryption keys while importing. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * --rewind-to-checkpoint * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *policy = NULL; nvlist_t *props = NULL; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; boolean_t pool_specified = B_FALSE; uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if (cachefile && do_scan) { (void) fprintf(stderr, gettext("-c is incompatible with -s\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path. */ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir, *tmp = NULL; envdup = strdup(env); for (dir = strtok_r(envdup, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) { searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = dir; } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; libpc_handle_t lpch = { .lpc_lib_handle = g_zfs, .lpc_ops = &libzfs_config_ops, .lpc_printerr = B_TRUE }; pools = zpool_search_import(&lpch, &idata); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " " ' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { free(searchdirs); free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, do_destroyed, pool_specified, do_all, &idata); /* * If we're using the cachefile and we failed to import, then * fallback to scanning the directory for pools that match * those in the cachefile. */ if (err != 0 && cachefile != NULL) { (void) printf(gettext("cachefile import failed, retrying\n")); /* * We use the scan flag to gather the directories that exist * in the cachefile. If we need to fallback to searching for * the pool config, we will only search devices in these * directories. */ idata.scan = B_TRUE; nvlist_free(pools); pools = zpool_search_import(&lpch, &idata); err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, do_destroyed, pool_specified, do_all, &idata); } error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); free(searchdirs); free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_namewidth; int cb_iteration; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; vdev_cbdata_t cb_vdevs; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 15 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {"rebuild", 1}, {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {"trimq_write", 2}, {"rebuildq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {"trim", 2}, {"rebuild", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {"rebuild"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. * For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ [IOS_RQ_HISTO] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width) / columns - slen / columns); if (text_start < 0) text_start = 0; printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; if (spaces_to_end < 0) spaces_to_end = 0; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified the "zpool status|iostat -c" then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. Just take the column names from the * first vdev and assume it works for all of them. */ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdevs.cb_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; color_start(ANSI_BOLD); if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdevs.cb_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); color_end(); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Prints a size string (i.e. 120M) with the suffix ("M") colored * by order of magnitude. Uses column_size to add padding. */ static void print_stat_color(const char *statbuf, unsigned int column_size) { fputs(" ", stdout); size_t len = strlen(statbuf); while (len < column_size) { fputc(' ', stdout); column_size--; } if (*statbuf == '0') { color_start(ANSI_GRAY); fputc('0', stdout); } else { for (; *statbuf; statbuf++) { if (*statbuf == 'K') color_start(ANSI_GREEN); else if (*statbuf == 'M') color_start(ANSI_YELLOW); else if (*statbuf == 'G') color_start(ANSI_RED); else if (*statbuf == 'T') color_start(ANSI_BOLD_BLUE); else if (*statbuf == 'P') color_start(ANSI_MAGENTA); else if (*statbuf == 'E') color_start(ANSI_CYAN); fputc(*statbuf, stdout); if (--column_size <= 0) break; } } color_end(); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else print_stat_color(buf, column_size); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. */ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. */ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *newnv) { const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (int i = 0; i < ARRAY_SIZE(names); i++) { uint64_t val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *const class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdevs.cb_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdevs.cb_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdevs.cb_names_count && (i == cb->cb_vdevs.cb_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev. */ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { const char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < ARRAY_SIZE(class_name); n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; const char *bias = NULL; const char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdevs.cb_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdevs.cb_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. */ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdevs.cb_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } /* * Return the required length of the pool/vdev name column. The minimum * allowed width and output formatting flags must be provided. */ static int get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { nvlist_t *config, *nvroot; int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); size_t poolname_len = strlen(zpool_get_name(zhp)); if (verbose == B_FALSE) { width = MAX(poolname_len, min_width); } else { width = MAX(poolname_len, max_width(zhp, nvroot, 0, min_width, flags)); } } return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. */ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(void *zhp_data, nvlist_t *nv, void *cb_data) { uint64_t guid; vdev_cbdata_t *cb = cb_data; zpool_handle_t *zhp = zhp_data; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (0); return (guid == zpool_vdev_path_to_guid(zhp, cb->cb_names[0])); } /* * Returns 1 if cb_data->cb_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, vdev_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_names for a second... */ tmp_name = cb->cb_names; /* Go though our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message. */ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, vdev_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_vdevs.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, &cb->cb_vdevs)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, &cb->cb_vdevs)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Floating point sleep(). Allows you to pass in a floating point value for * seconds. */ static void fsleep(float sec) { struct timespec req; req.tv_sec = floor(sec); req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; nanosleep(&req, NULL); } /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. */ static int terminal_height(void) { struct winsize win; if (isatty(STDOUT_FILENO) == 0) return (-1); if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) return (win.ws_row); return (-1); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, (char *)"-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go though the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { if (snprintf(fullpath, sizeof (fullpath), "%s/%s", dirpath, ent->d_name) >= sizeof (fullpath)) { (void) fprintf(stderr, gettext("internal error: " "ZPOOL_SCRIPTS_PATH too large.\n")); exit(1); } /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts. */ static void print_zpool_script_list(const char *subcommand) { char *dir, *sp, *tmp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; for (dir = strtok_r(sp, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) print_zpool_dir_scripts(dir); free(sp); } /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; int width, available_width; /* * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the * padding width for names that are less than maximum width. The stats * take up 42 characters, so the width available for names is: */ available_width = get_columns() - 42; /* * If the maximum width fits on a screen, then great! Make everything * line up by justifying all lines to the same width. If that max * width is larger than what's available, the name plus stats won't fit * on one line, and justifying to that width would cause every line to * wrap on the screen. We only want lines with long names to wrap. * Limit the padding to what won't wrap. */ if (width > available_width) width = available_width; /* * And regardless of whatever the screen width is (get_columns can * return 0 if the width is not known or less than 42 for a narrow * terminal) have the width be a minimum of 10. */ if (width < 10) width = 10; /* Save the calculated width */ cb->cb_namewidth = width; return (0); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'n': headers_once = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb.cb_vdevs)) { /* All the args are vdevs */ cb.cb_vdevs.cb_names = argv; cb.cb_vdevs.cb_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb.cb_vdevs)) { /* ...and the rest are vdev names */ cb.cb_vdevs.cb_names = argv + 1; cb.cb_vdevs.cb_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb.cb_vdevs); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdevs.cb_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, ZFS_TYPE_POOL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_iostat, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdevs.cb_names, cb.cb_vdevs.cb_names_count, cb.cb_vdevs.cb_name_flags); } else { cb.vcdl = NULL; } /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. */ if (winheight < 0) headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. */ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose) || (!headers_once && (cb.cb_iteration % winheight) == 0)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdevs.cb_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } /* * Flush the output so that redirection to a file isn't buffered * indefinitely. */ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) fputs(" ", stdout); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) fputs(header, stdout); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) fputc('\n', stdout); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; const char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) fputc('\t', stdout); else (void) fputs(" ", stdout); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else if (zfs_prop_user(pl->pl_user_prop) && zpool_get_userprop(zhp, pl->pl_user_prop, property, sizeof (property), NULL) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) fputs(propstr, stdout); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) fputc('\n', stdout); } static void print_one_column(zpool_prop_t prop, uint64_t value, const char *str, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_SIZE: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? "%2.1f%%" : "%3.0f%%", value / 100.0); break; case ZPOOL_PROP_HEALTH: width = 8; (void) strlcpy(propval, str, sizeof (propval)); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } /* * print static default line per vdev * not compatible with '-o' option */ static void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; const char *dashes = "%-*s - - - - " "- - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; const char *state; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ if (VDEV_STAT_VALID(vs_pspace, c) && vs->vs_pspace) print_one_column(ZPOOL_PROP_SIZE, vs->vs_pspace, NULL, scripted, B_TRUE, format); else print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, NULL, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, scripted, toplevel, format); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, B_TRUE, format); (void) fputc('\n', stdout); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < ARRAY_SIZE(class_name); n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { const char *bias = NULL; const char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_TRUE); free(vname); } } } /* * Generic callback function to list a pool. */ static int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; print_pool(zhp, cbp); if (cbp->cb_verbose) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); } return (0); } /* * Set the minimum pool/vdev name column width. The width must be at least 9, * but may be as large as needed. */ static int get_namewidth_list(zpool_handle_t *zhp, void *data) { list_cbdata_t *cb = data; int width; width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); if (width < 9) width = 9; cb->cb_namewidth = width; return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; current_prop_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, ZFS_TYPE_POOL, cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 's': rebuild = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) ret = zpool_wait(zhp, replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * * Replace with . */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering to complete before returning * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of * and . In either case, will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; int ms_status = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); print_vdev_tree(NULL, "dedup", config, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", config, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { ms_status = zpool_enable_datasets(zhp, mntopts, 0); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Split was successful, " "datasets are mounted but sharing of some datasets " "has failed\n")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Split was successful" ", but some datasets could not be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online ... */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { vdev_state_t oldstate; boolean_t avail_spare, l2cache; nvlist_t *tgt = zpool_find_vdev(zhp, argv[i], &avail_spare, &l2cache, NULL); if (tgt == NULL) { ret = 1; continue; } uint_t vsc; oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt, ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state; if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); if ((flags & ZFS_ONLINE_EXPAND)) { (void) printf(gettext("%s: failed " "to expand usable space on " "unhealthy device '%s'\n"), (oldstate >= VDEV_STATE_DEGRADED ? "error" : "warning"), argv[i]); if (oldstate >= VDEV_STATE_DEGRADED) { ret = 1; break; } } } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] ... * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (fault) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear [device] * * Clear all errors associated with a pool or a particular device. */ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } static int wait_callback(zpool_handle_t *zhp, void *data) { zpool_wait_activity_t *act = data; return (zpool_wait(zhp, *act)); } /* * zpool scrub [-s | -p] [-w] [-e] ... * * -e Only scrub blocks in the error log. * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; boolean_t wait = B_FALSE; int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; boolean_t is_error_scrub = B_FALSE; boolean_t is_pause = B_FALSE; boolean_t is_stop = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "spwe")) != -1) { switch (c) { case 'e': is_error_scrub = B_TRUE; break; case 's': is_stop = B_TRUE; break; case 'p': is_pause = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (is_pause && is_stop) { (void) fprintf(stderr, gettext("invalid option " "combination :-s and -p are mutually exclusive\n")); usage(B_FALSE); } else { if (is_error_scrub) cb.cb_type = POOL_SCAN_ERRORSCRUB; if (is_pause) { cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; } else if (is_stop) { cb.cb_type = POOL_SCAN_NONE; } else { cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; } } if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " "-w cannot be used with -p or -s\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, wait_callback, &act); } return (error); } /* * zpool resilver ... * * Restarts any in-progress resilver */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, scrub_callback, &cb)); } /* * zpool trim [-d] [-r ] [-c | -s] [ ...] * * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. * -w Wait. Blocks until trimming has completed. */ int zpool_do_trim(int argc, char **argv) { struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_CANCEL; break; case 'd': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-d cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } secure = B_TRUE; break; case 'r': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-r cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } if (zfs_nicestrtonum(g_zfs, optarg, &rate) == -1) { (void) fprintf(stderr, "%s: %s\n", gettext("invalid value for rate"), libzfs_error_description(g_zfs)); usage(B_FALSE); } break; case 's': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); trimflags_t trim_flags = { .secure = secure, .rate = rate, .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); trim_flags.fullpool = B_TRUE; } else { trim_flags.fullpool = B_FALSE; for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); fnvlist_free(vdevs); zpool_close(zhp); return (error); } /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. */ static void secs_to_dhms(uint64_t total, char *buf) { uint64_t days = total / 60 / 60 / 24; uint64_t hours = (total / 60 / 60) % 24; uint64_t mins = (total / 60) % 60; uint64_t secs = (total % 60); if (days > 0) { (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", (u_longlong_t)days, (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } else { (void) sprintf(buf, "%02llu:%02llu:%02llu", (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } } /* * Print out detailed error scrub status. */ static void print_err_scrub_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t total_secs_left; uint64_t secs_left, mins_left, hours_left, days_left; uint64_t examined, to_be_examined; if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) { return; } (void) printf(gettext(" scrub: ")); start = ps->pss_error_scrub_start; end = ps->pss_error_scrub_end; pause = ps->pss_pass_error_scrub_pause; examined = ps->pss_error_scrub_examined; to_be_examined = ps->pss_error_scrub_to_be_examined; assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); if (ps->pss_error_scrub_state == DSS_FINISHED) { total_secs_left = end - start; days_left = total_secs_left / 60 / 60 / 24; hours_left = (total_secs_left / 60 / 60) % 24; mins_left = (total_secs_left / 60) % 60; secs_left = (total_secs_left % 60); (void) printf(gettext("scrubbed %llu error blocks in %llu days " "%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined, (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left, ctime(&end)); return; } else if (ps->pss_error_scrub_state == DSS_CANCELED) { (void) printf(gettext("error scrub canceled on %s"), ctime(&end)); return; } assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING); /* Error scrub is in progress. */ if (pause == 0) { (void) printf(gettext("error scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("error scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\terror scrub started on %s"), ctime(&start)); } double fraction_done = (double)examined / (to_be_examined + examined); (void) printf(gettext("\t%.2f%% done, issued I/O for %llu error" " blocks"), 100 * fraction_done, (u_longlong_t)examined); (void) printf("\n"); } /* * Print out detailed scrub status. */ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7]; char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); int is_resilver = ps->pss_func == POOL_SCAN_RESILVER; int is_scrub = ps->pss_func == POOL_SCAN_SCRUB; assert(is_resilver || is_scrub); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); if (is_scrub) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (is_resilver) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (is_scrub) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (is_resilver) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ if (is_scrub) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (is_resilver) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total_s = ps->pss_to_examine; total_i = ps->pss_to_examine - ps->pss_skipped; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total_i; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf)); zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf)); /* do not print estimated time if we have a paused scrub */ (void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf); if (pause == 0 && scan_rate > 0) { zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); (void) printf(gettext(" at %s/s"), srate_buf); } (void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf); if (pause == 0 && issue_rate > 0) { zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); (void) printf(gettext(" at %s/s"), irate_buf); } (void) printf(gettext("\n")); if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (is_scrub) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { /* * Only provide an estimate iff: * 1) we haven't yet issued all we expected, and * 2) the issue rate exceeds 10 MB/s, and * 3) it's either: * a) a resilver which has started repairs, or * b) a scrub which has entered the issue phase. */ if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 && ((is_resilver && ps->pss_processed > 0) || (is_scrub && issued > 0))) { secs_to_dhms((total_i - issued) / issue_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } static void print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; uint64_t bytes_est_s = vrs->vrs_bytes_est; uint64_t bytes_est_i = vrs->vrs_bytes_est; if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8) bytes_est_i -= vrs->vrs_pass_bytes_skipped; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / (bytes_est_s + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); zfs_nicebytes(bytes_issued, bytes_issued_buf, sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf)); zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; /* Rebuild is finished or canceled. */ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); (void) printf(gettext("resilvered (%s) %s in %s " "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { (void) printf(gettext("resilver (%s) canceled on %s"), vdev_name, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { (void) printf(gettext("resilver (%s) in progress since %s"), vdev_name, ctime(&start)); } assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf, bytes_est_s_buf); if (scan_rate > 0) { zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); (void) printf(gettext(" at %s/s"), scan_rate_buf); } (void) printf(gettext(", %s / %s issued"), bytes_issued_buf, bytes_est_i_buf); if (issue_rate > 0) { zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); (void) printf(gettext(" at %s/s"), issue_rate_buf); } (void) printf(gettext("\n")); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { if (bytes_est_s >= bytes_scanned && scan_rate >= 10 * 1024 * 1024) { secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * Print rebuild status for top-level vdevs. */ static void print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); print_rebuild_status_impl(vrs, i, name); free(name); } } } /* * As we don't scrub checkpointed blocks, we want to warn the user that we * skipped scanning some blocks if a checkpoint exists or existed at any * time during the scan. If a sequential instead of healing reconstruction * was performed then the blocks were reconstructed. However, their checksums * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Returns B_TRUE if there is an active rebuild in progress. Otherwise, * B_FALSE is returned and 'rebuild_end_time' is set to the end time for * the last completed (or cancelled) rebuild. */ static boolean_t check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) { nvlist_t **child; uint_t children; boolean_t rebuilding = B_FALSE; uint64_t end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_end_time > end_time) end_time = vrs->vrs_end_time; if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { rebuilding = B_TRUE; end_time = 0; break; } } } if (rebuild_end_time != NULL) *rebuild_end_time = end_time; return (rebuilding); } /* * Print the scan status. */ static void print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; boolean_t have_errorscrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; time_t scrub_start = 0, errorscrub_start = 0; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { if (ps->pss_func == POOL_SCAN_RESILVER) { resilver_end_time = ps->pss_end_time; active_resilver = (ps->pss_state == DSS_SCANNING); } have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); scrub_start = ps->pss_start_time; if (c > offsetof(pool_scan_stat_t, pss_pass_error_scrub_pause) / 8) { have_errorscrub = (ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); errorscrub_start = ps->pss_error_scrub_start; } } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ if (have_scrub && scrub_start > errorscrub_start) print_scan_scrub_resilver_status(ps); else if (have_errorscrub && errorscrub_start >= scrub_start) print_err_scrub_status(ps); /* * When there is an active resilver or rebuild print its status. * Otherwise print the status of the last resilver or rebuild. */ if (active_resilver || (!active_rebuild && have_resilver && resilver_end_time && resilver_end_time > rebuild_end_time)) { print_scan_scrub_resilver_status(ps); } else if (active_rebuild || (!active_resilver && have_rebuild && rebuild_end_time && rebuild_end_time > resilver_end_time)) { print_rebuild_status(zhp, nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_scan_warning(ps, pcs); } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. */ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext( "\t%s copied out of %s at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext( "\t%s memory used for removed device mappings\n"), mem_buf); } } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. */ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE, NULL); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; char dspace[6], mspace[6]; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); (void) printf("DDT entries %llu, size %s on disk, %s in core\n", (u_longlong_t)ddo->ddo_count, dspace, mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; const char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED || reason == ZPOOL_STATUS_COMPATIBILITY_ERR || reason == ZPOOL_STATUS_INCOMPATIBLE_FEAT)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); printf(" "); printf_color(ANSI_BOLD, gettext("pool:")); printf(" %s\n", zpool_get_name(zhp)); fputc(' ', stdout); printf_color(ANSI_BOLD, gettext("state: ")); printf_color(health_str_to_color(health), "%s", health); fputc('\n', stdout); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. Sufficient replicas exist for\n\tthe pool " "to continue functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. There are insufficient\n\treplicas for the" " pool to continue functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. Applications are " "unaffected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Determine if the " "device needs to be replaced, and clear the errors\n\tusing" " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_REBUILD_SCRUB: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices have " "been sequentially resilvered, scrubbing\n\tthe pool " "is recommended.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " "verify all data checksums.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Restore the file in question" " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " "'zpool upgrade'. Once this is done, the\n\tpool will no " "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " "to a newer, incompatible on-disk version.\n\tThe pool " "cannot be accessed on this system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported and " "requested features are not enabled on the pool.\n\t" "The pool can still be used, but some features are " "unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(7) for details.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("This pool has a " "compatibility list specified, but it could not be\n\t" "read/parsed at this time. The pool can still be used, " "but this\n\tshould be investigated.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Check the value of the " "'compatibility' property against the\n\t" "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n\t" "requested by the 'compatibility' property.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Consider setting " "'compatibility' to an appropriate value, or\n\t" "adding needed features to the relevant file in\n\t" ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "on this system because it uses the\n\tfollowing feature(s)" " not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system that supports the required feature(s),\n\tor " "restore the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " "pool from a backup source. Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is suspended " "because multihost writes failed or were delayed;\n\t" "another system could import the pool undetected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to IO failures.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the affected " "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: (void) printf(gettext("status: One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); (void) printf(gettext("action: Replace affected devices with " "devices that support the\n\tconfigured block size, or " "migrate data to a properly configured\n\tpool.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Export this pool on all " "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext("\tExisting encrypted snapshots " "and bookmarks contain an on-disk\n\tincompat" "ibility. This may cause on-disk corruption if " "they are used\n\twith 'zfs recv'.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the" "issue, enable the bookmark_v2 feature. No " "additional\n\taction is needed if there are no " "encrypted snapshots or bookmarks.\n\tIf preserving" "the encrypted snapshots and bookmarks is required," " use\n\ta non-raw send to backup and restore them." " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. */ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) { printf(" "); printf_color(ANSI_BOLD, gettext("see:")); printf(gettext( " https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); color_end(); if (cbp->cb_print_slow_ios) { printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { (void) printf("\n"); if (nerr == 0) { (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); } else { print_error_log(zhp); } } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... * [interval [count]] * * -c CMD For each vdev, run command CMD * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; /* check options */ while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'p': cb.cb_literal = B_TRUE; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 's': cb.cb_print_slow_ios = B_TRUE; break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems_v2(zhp, 0, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { (void) fprintf(stderr, gettext("Upgrade not performed because " "'compatibility' property set to '" ZPOOL_COMPAT_LEGACY "'.\n")); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) return (-1); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid) && requested_features[i]) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t modified_pool = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); modified_pool = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; modified_pool = B_TRUE; } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n" "Note that setting a pool's 'compatibility' " "feature to '" ZPOOL_COMPAT_LEGACY "' will\n" "inhibit upgrades.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(7) for " "details.\n\n" "Note that the pool " "'compatibility' feature can be " "used to inhibit\nfeature " "upgrades.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t modified_pool = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { modified_pool = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { modified_pool = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported and requested features enabled.\n"), zpool_get_name(zhp)); } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf("%s", gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; if (!fi->fi_zfs_mod_supported) continue; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? " (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported and " "requested features enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported and requested features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; static void print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { nvlist_t **records; uint_t numrecords; int i; verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[64] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) { uint64_t elapsed_ns = fnvlist_lookup_int64(records[i], ZPOOL_HIST_ELAPSED_NS); (void) snprintf(tbuf + strlen(tbuf), sizeof (tbuf) - strlen(tbuf), " (%lldms)", (long long)elapsed_ns / 1000 / 1000); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { (void) printf(" output nvlist omitted; " "original size: %lldKB\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_OUTPUT_SIZE) / 1024); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } } /* * Print out the command history for a specific pool. */ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; int ret; hist_cbdata_t *cb = (hist_cbdata_t *)data; uint64_t off = 0; boolean_t eof = B_FALSE; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); while (!eof) { if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) return (ret); print_history_records(nvhis, cb); nvlist_free(nvhis); } (void) printf("\n"); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. */ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32]; const char *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; const char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aide zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ? str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { const char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; const char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(void) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events logs by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback_vdev(zpool_handle_t *zhp, char *vdevname, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[ZFS_MAXPROPLEN]; zprop_source_t srctype; for (zprop_list_t *pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { char *prop_name; /* * If the first property is pool name, it is a special * placeholder that we can skip. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL) { prop_name = pl->pl_user_prop; } else { prop_name = (char *)vdev_prop_to_name(pl->pl_prop); } if (zpool_get_vdev_prop(zhp, vdevname, pl->pl_prop, prop_name, value, sizeof (value), &srctype, cbp->cb_literal) == 0) { zprop_print_one_property(vdevname, cbp, prop_name, value, srctype, NULL, NULL); } } return (0); } static int get_callback_vdev_cb(void *zhp_data, nvlist_t *nv, void *data) { zpool_handle_t *zhp = zhp_data; zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char *vdevname; const char *type; int ret; /* * zpool_vdev_name() transforms the root vdev name (i.e., root-0) to the * pool name for display purposes, which is not desired. Fallback to * zpool_vdev_name() when not dealing with the root vdev. */ type = fnvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE); if (zhp != NULL && strcmp(type, "root") == 0) vdevname = strdup("root-0"); else vdevname = zpool_vdev_name(g_zfs, zhp, nv, cbp->cb_vdevs.cb_name_flags); (void) vdev_expand_proplist(zhp, vdevname, &cbp->cb_proplist); ret = get_callback_vdev(zhp, vdevname, data); free(vdevname); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[ZFS_MAXPROPLEN]; zprop_source_t srctype; zprop_list_t *pl; int vid; if (cbp->cb_type == ZFS_TYPE_VDEV) { if (strcmp(cbp->cb_vdevs.cb_names[0], "all-vdevs") == 0) { for_each_vdev(zhp, get_callback_vdev_cb, data); } else { /* Adjust column widths for vdev properties */ for (vid = 0; vid < cbp->cb_vdevs.cb_names_count; vid++) { vdev_expand_proplist(zhp, cbp->cb_vdevs.cb_names[vid], &cbp->cb_proplist); } /* Display the properties */ for (vid = 0; vid < cbp->cb_vdevs.cb_names_count; vid++) { get_callback_vdev(zhp, cbp->cb_vdevs.cb_names[vid], data); } } } else { assert(cbp->cb_type == ZFS_TYPE_POOL); for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also * skip over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && zfs_prop_user(pl->pl_user_prop)) { srctype = ZPROP_SRC_LOCAL; if (zpool_get_userprop(zhp, pl->pl_user_prop, value, sizeof (value), &srctype) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } else if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property( zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *propstr = NULL; char *vdev = NULL; cb.cb_first = B_TRUE; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; cb.cb_vdevs.cb_name_flags |= VDEV_NAME_TYPE_ID; current_prop_type = cb.cb_type; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': memset(&cb.cb_columns, 0, sizeof (cb.cb_columns)); i = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "name", "property", "value", "source", "all" }; static const zfs_get_column_t col_cols[] = { GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_SOURCE }; if (i == ZFS_GET_NCOLS - 1) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } for (c = 0; c < ARRAY_SIZE(col_opts); ++c) if (strcmp(tok, col_opts[c]) == 0) goto found; (void) fprintf(stderr, gettext("invalid column name '%s'\n"), tok); usage(B_FALSE); found: if (c >= 4) { if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } memcpy(cb.cb_columns, col_cols, sizeof (col_cols)); i = ZFS_GET_NCOLS - 1; } else cb.cb_columns[i++] = col_cols[c]; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } /* Properties list is needed later by zprop_get_list() */ propstr = argv[0]; argc--; argv++; if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if ((argc == 2 && strcmp(argv[1], "all-vdevs") == 0) || (argc == 2 && strcmp(argv[1], "root") == 0) || are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb.cb_vdevs)) { if (strcmp(argv[1], "root") == 0) vdev = strdup("root-0"); else vdev = strdup(argv[1]); /* ... and the rest are vdev names */ cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = argc - 1; cb.cb_type = ZFS_TYPE_VDEV; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected a list of vdevs in" " \"%s\", but got:\n"), argv[0]); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb.cb_vdevs); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The first arg isn't a pool name, */ fprintf(stderr, gettext("missing pool name.\n")); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } if (zprop_get_list(g_zfs, propstr, &cb.cb_proplist, cb.cb_type) != 0) { /* Use correct list of valid properties (pool or vdev) */ current_prop_type = cb.cb_type; usage(B_FALSE); } if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_type, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); if (vdev != NULL) free(vdev); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; zfs_type_t cb_type; vdev_cbdata_t cb_vdevs; boolean_t cb_any_successful; } set_cbdata_t; static int set_pool_callback(zpool_handle_t *zhp, set_cbdata_t *cb) { int error; /* Check if we have out-of-bounds features */ if (strcmp(cb->cb_propname, ZPOOL_CONFIG_COMPATIBILITY) == 0) { boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(cb->cb_value, features) != ZPOOL_COMPATIBILITY_OK) return (-1); nvlist_t *enabled = zpool_get_features(zhp); spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; if (nvlist_exists(enabled, fguid) && !features[i]) break; } if (i < SPA_FEATURES) (void) fprintf(stderr, gettext("Warning: one or " "more features already enabled on pool '%s'\n" "are not present in this compatibility set.\n"), zpool_get_name(zhp)); } /* if we're setting a feature, check it's in compatibility set */ if (zpool_prop_feature(cb->cb_propname) && strcmp(cb->cb_value, ZFS_FEATURE_ENABLED) == 0) { char *fname = strchr(cb->cb_propname, '@') + 1; spa_feature_t f; if (zfeature_lookup_name(fname, &f) == 0) { char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(compat, features) != ZPOOL_COMPATIBILITY_OK) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "because the pool's 'compatibility' " "property cannot be parsed.\n"), fname, zpool_get_name(zhp)); return (-1); } if (!features[f]) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "as it is not specified in this pool's " "current compatibility set.\n" "Consider setting 'compatibility' to a " "less restrictive set, or to 'off'.\n"), fname, zpool_get_name(zhp)); return (-1); } } } error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); return (error); } static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; if (cb->cb_type == ZFS_TYPE_VDEV) { error = zpool_set_vdev_prop(zhp, *cb->cb_vdevs.cb_names, cb->cb_propname, cb->cb_value); } else { assert(cb->cb_type == ZFS_TYPE_POOL); error = set_pool_callback(zhp, cb); } cb->cb_any_successful = !error; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; char *vdev = NULL; current_prop_type = ZFS_TYPE_POOL; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 4) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_type = ZFS_TYPE_POOL; cb.cb_vdevs.cb_name_flags |= VDEV_NAME_TYPE_ID; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; argc -= 2; argv += 2; /* argv[0] is pool name */ if (!is_pool(argv[0])) { (void) fprintf(stderr, gettext("cannot open '%s': is not a pool\n"), argv[0]); return (EINVAL); } /* argv[1], when supplied, is vdev name */ if (argc == 2) { if (strcmp(argv[1], "root") == 0) vdev = strdup("root-0"); else vdev = strdup(argv[1]); if (!are_vdevs_in_pool(1, &vdev, argv[0], &cb.cb_vdevs)) { (void) fprintf(stderr, gettext( "cannot find '%s' in '%s': device not in pool\n"), vdev, argv[0]); free(vdev); return (EINVAL); } cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = 1; cb.cb_type = ZFS_TYPE_VDEV; } error = for_each_pool(1, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, set_callback, &cb); if (vdev != NULL) free(vdev); return (error); } /* Add up the total number of bytes left to initialize/trim across all vdevs */ static uint64_t vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) { uint64_t bytes_remaining; nvlist_t **child; uint_t c, children; vdev_stat_t *vs; assert(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (activity == ZPOOL_WAIT_INITIALIZE && vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) bytes_remaining = vs->vs_initialize_bytes_est - vs->vs_initialize_bytes_done; else if (activity == ZPOOL_WAIT_TRIM && vs->vs_trim_state == VDEV_TRIM_ACTIVE) bytes_remaining = vs->vs_trim_bytes_est - vs->vs_trim_bytes_done; else bytes_remaining = 0; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) bytes_remaining += vdev_activity_remaining(child[c], activity); return (bytes_remaining); } /* Add up the total number of bytes left to rebuild across top-level vdevs */ static uint64_t vdev_activity_top_remaining(nvlist_t *nv) { uint64_t bytes_remaining = 0; nvlist_t **child; uint_t children; int error; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; error = nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); if (error == 0) { if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { bytes_remaining += (vrs->vrs_bytes_est - vrs->vrs_bytes_rebuilt); } } } return (bytes_remaining); } /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) { nvlist_t **child; uint_t c, children; const char *vdev_type; (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) { if (vdev_any_spare_replacing(child[c])) return (B_TRUE); } return (B_FALSE); } typedef struct wait_data { char *wd_poolname; boolean_t wd_scripted; boolean_t wd_exact; boolean_t wd_headers_once; boolean_t wd_should_exit; /* Which activities to wait for */ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; float wd_interval; pthread_cond_t wd_cv; pthread_mutex_t wd_mutex; } wait_data_t; /* * Print to stdout a single line, containing one column for each activity that * we are waiting for specifying how many bytes of work are left for that * activity. */ static void print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) { nvlist_t *config, *nvroot; uint_t c; int i; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { /* * Make sure we have enough space in the col for pretty-printed * numbers and for the column header, and then leave a couple * spaces between cols for readability. */ col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && row % (term_height-1) == 0); if (!wd->wd_scripted && (row == 0 || reprint_header)) { for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { if (wd->wd_enabled[i]) (void) printf("%*s", col_widths[i], headers[i]); } (void) fputc('\n', stdout); } /* Bytes of work remaining in each activity */ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; bytes_rem[ZPOOL_WAIT_FREE] = zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); if (prs != NULL && prs->prs_state == DSS_SCANNING) bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - prs->prs_copied; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); if (pss != NULL && pss->pss_state == DSS_SCANNING && pss->pss_pass_scrub_pause == 0) { int64_t rem = pss->pss_to_examine - pss->pss_issued; if (pss->pss_func == POOL_SCAN_SCRUB) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; } else if (check_rebuilding(nvroot, NULL)) { bytes_rem[ZPOOL_WAIT_RESILVER] = vdev_activity_top_remaining(nvroot); } bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); /* * A replace finishes after resilvering finishes, so the amount of work * left for a replace is the same as for resilvering. * * It isn't quite correct to say that if we have any 'spare' or * 'replacing' vdevs and a resilver is happening, then a replace is in * progress, like we do here. When a hot spare is used, the faulted vdev * is not removed after the hot spare is resilvered, so parent 'spare' * vdev is not removed either. So we could have a 'spare' vdev, but be * resilvering for a different reason. However, we use it as a heuristic * because we don't have access to the DTLs, which could tell us whether * or not we have really finished resilvering a hot spare. */ if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) continue; if (wd->wd_exact) (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); else zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); else (void) printf(" %*s", col_widths[i] - 1, buf); } (void) printf("\n"); (void) fflush(stdout); } static void * wait_status_thread(void *arg) { wait_data_t *wd = (wait_data_t *)arg; zpool_handle_t *zhp; if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) return (void *)(1); for (int row = 0; ; row++) { boolean_t missing; struct timespec timeout; int ret = 0; (void) clock_gettime(CLOCK_REALTIME, &timeout); if (zpool_refresh_stats(zhp, &missing) != 0 || missing || zpool_props_refresh(zhp) != 0) { zpool_close(zhp); return (void *)(uintptr_t)(missing ? 0 : 1); } print_wait_status_row(wd, zhp, row); timeout.tv_sec += floor(wd->wd_interval); long nanos = timeout.tv_nsec + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; if (nanos >= NANOSEC) { timeout.tv_sec++; timeout.tv_nsec = nanos - NANOSEC; } else { timeout.tv_nsec = nanos; } pthread_mutex_lock(&wd->wd_mutex); if (!wd->wd_should_exit) ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, &timeout); pthread_mutex_unlock(&wd->wd_mutex); if (ret == 0) { break; /* signaled by main thread */ } else if (ret != ETIMEDOUT) { (void) fprintf(stderr, gettext("pthread_cond_timedwait " "failed: %s\n"), strerror(ret)); zpool_close(zhp); return (void *)(uintptr_t)(1); } } zpool_close(zhp); return (void *)(0); } int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; int c, i; unsigned long count; pthread_t status_thr; int error = 0; zpool_handle_t *zhp; wait_data_t wd; wd.wd_scripted = B_FALSE; wd.wd_exact = B_FALSE; wd.wd_headers_once = B_FALSE; wd.wd_should_exit = B_FALSE; pthread_mutex_init(&wd.wd_mutex, NULL); pthread_cond_init(&wd.wd_cv, NULL); /* By default, wait for all types of activity. */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) wd.wd_enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "HpT:t:")) != -1) { switch (c) { case 'H': wd.wd_scripted = B_TRUE; break; case 'n': wd.wd_headers_once = B_TRUE; break; case 'p': wd.wd_exact = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 't': /* Reset activities array */ memset(&wd.wd_enabled, 0, sizeof (wd.wd_enabled)); for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { wd.wd_enabled[i] = B_TRUE; goto found; } (void) fprintf(stderr, gettext("invalid activity '%s'\n"), tok); usage(B_FALSE); found:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &wd.wd_interval, &count); if (count != 0) { /* This subcmd only accepts an interval, not a count */ (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (wd.wd_interval != 0) verbose = B_TRUE; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'pool' argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } wd.wd_poolname = argv[0]; if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) return (1); if (verbose) { /* * We use a separate thread for printing status updates because * the main thread will call lzc_wait(), which blocks as long * as an activity is in progress, which can be a long time. */ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) != 0) { (void) fprintf(stderr, gettext("failed to create status" "thread: %s\n"), strerror(errno)); zpool_close(zhp); return (1); } } /* * Loop over all activities that we are supposed to wait for until none * of them are in progress. Note that this means we can end up waiting * for more activities to complete than just those that were in progress * when we began waiting; if an activity we are interested in begins * while we are waiting for another activity, we will wait for both to * complete before exiting. */ for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!wd.wd_enabled[i]) continue; error = zpool_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zpool_close(zhp); if (verbose) { uintptr_t status; pthread_mutex_lock(&wd.wd_mutex); wd.wd_should_exit = B_TRUE; pthread_cond_signal(&wd.wd_cv); pthread_mutex_unlock(&wd.wd_mutex); (void) pthread_join(status_thr, (void *)&status); if (status != 0) error = status; } pthread_mutex_destroy(&wd.wd_mutex); pthread_cond_destroy(&wd.wd_cv); return (error); } static int find_command_idx(const char *command, int *idx) { for (int i = 0; i < NCOMMAND; ++i) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } /* * Display version message */ static int zpool_do_version(int argc, char **argv) { (void) argc, (void) argv; return (zfs_version_print() != 0); } /* Display documentation */ static int zpool_do_help(int argc, char **argv) { char page[MAXNAMELEN]; if (argc < 3 || strcmp(argv[2], "zpool") == 0) strcpy(page, "zpool"); else if (strcmp(argv[2], "concepts") == 0 || strcmp(argv[2], "props") == 0) snprintf(page, sizeof (page), "zpool%s", argv[2]); else snprintf(page, sizeof (page), "zpool-%s", argv[2]); execlp("man", "man", page, NULL); fprintf(stderr, "couldn't run man program: %s", strerror(errno)); return (-1); } /* * Do zpool_load_compat() and print error message on failure */ static zpool_compat_status_t zpool_do_load_compat(const char *compat, boolean_t *list) { char report[1024]; zpool_compat_status_t ret; ret = zpool_load_compat(compat, list, report, 1024); switch (ret) { case ZPOOL_COMPATIBILITY_OK: break; case ZPOOL_COMPATIBILITY_NOFILES: case ZPOOL_COMPATIBILITY_BADFILE: case ZPOOL_COMPATIBILITY_BADTOKEN: (void) fprintf(stderr, "Error: %s\n", report); break; case ZPOOL_COMPATIBILITY_WARNTOKEN: (void) fprintf(stderr, "Warning: %s\n", report); ret = ZPOOL_COMPATIBILITY_OK; break; } return (ret); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); /* * Special case 'help' */ if (strcmp(cmdname, "help") == 0) return (zpool_do_help(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = safe_malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/sys/contrib/openzfs/config/zfs-build.m4 b/sys/contrib/openzfs/config/zfs-build.m4 index 5ea6aa29a3de..e4197dc1424e 100644 --- a/sys/contrib/openzfs/config/zfs-build.m4 +++ b/sys/contrib/openzfs/config/zfs-build.m4 @@ -1,643 +1,646 @@ AC_DEFUN([ZFS_AC_LICENSE], [ AC_MSG_CHECKING([zfs author]) AC_MSG_RESULT([$ZFS_META_AUTHOR]) AC_MSG_CHECKING([zfs license]) AC_MSG_RESULT([$ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_DEBUG_ENABLE], [ DEBUG_CFLAGS="-Werror" DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_with_debug" WITH_DEBUG="true" AC_DEFINE(ZFS_DEBUG, 1, [zfs debugging enabled]) KERNEL_DEBUG_CFLAGS="-Werror" KERNEL_DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" ]) AC_DEFUN([ZFS_AC_DEBUG_DISABLE], [ DEBUG_CFLAGS="" DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_without_debug" WITH_DEBUG="" KERNEL_DEBUG_CFLAGS="" KERNEL_DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" ]) dnl # dnl # When debugging is enabled: dnl # - Enable all ASSERTs (-DDEBUG) dnl # - Promote all compiler warnings to errors (-Werror) dnl # dnl # (If INVARIANTS is detected, we need to force DEBUG, or strange panics dnl # can ensue.) dnl # AC_DEFUN([ZFS_AC_DEBUG], [ AC_MSG_CHECKING([whether assertion support will be enabled]) AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable compiler and code assertions @<:@default=no@:>@])], [], [enable_debug=no]) AS_CASE(["x$enable_debug"], ["xyes"], [ZFS_AC_DEBUG_ENABLE], ["xno"], [ZFS_AC_DEBUG_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debug])]) AS_CASE(["x$enable_invariants"], ["xyes"], [], ["xno"], [], [ZFS_AC_DEBUG_INVARIANTS_DETECT]) AS_CASE(["x$enable_invariants"], ["xyes"], [ZFS_AC_DEBUG_ENABLE], ["xno"], [], [AC_MSG_ERROR([Unknown option $enable_invariants])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUG_CPPFLAGS) AC_SUBST(DEBUG_LDFLAGS) AC_SUBST(DEBUG_ZFS) AC_SUBST(WITH_DEBUG) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_MSG_RESULT([$enable_debug]) ]) AC_DEFUN([ZFS_AC_DEBUGINFO_ENABLE], [ DEBUG_CFLAGS="$DEBUG_CFLAGS -g -fno-inline $NO_IPA_SRA" KERNEL_DEBUG_CFLAGS="$KERNEL_DEBUG_CFLAGS -fno-inline $KERNEL_NO_IPA_SRA" KERNEL_MAKE="$KERNEL_MAKE CONFIG_DEBUG_INFO=y" DEBUGINFO_ZFS="_with_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO_DISABLE], [ DEBUGINFO_ZFS="_without_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO], [ AC_MSG_CHECKING([whether debuginfo support will be forced]) AC_ARG_ENABLE([debuginfo], [AS_HELP_STRING([--enable-debuginfo], [Force generation of debuginfo @<:@default=no@:>@])], [], [enable_debuginfo=no]) AS_CASE(["x$enable_debuginfo"], ["xyes"], [ZFS_AC_DEBUGINFO_ENABLE], ["xno"], [ZFS_AC_DEBUGINFO_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debuginfo])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUGINFO_ZFS) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_MAKE) AC_MSG_RESULT([$enable_debuginfo]) ]) dnl # dnl # Disabled by default, provides basic memory tracking. Track the total dnl # number of bytes allocated with kmem_alloc() and freed with kmem_free(). dnl # Then at module unload time if any bytes were leaked it will be reported dnl # on the console. dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM], [ AC_MSG_CHECKING([whether basic kmem accounting is enabled]) AC_ARG_ENABLE([debug-kmem], [AS_HELP_STRING([--enable-debug-kmem], [Enable basic kmem accounting @<:@default=no@:>@])], [], [enable_debug_kmem=no]) AS_IF([test "x$enable_debug_kmem" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM" DEBUG_KMEM_ZFS="_with_debug_kmem" ], [ DEBUG_KMEM_ZFS="_without_debug_kmem" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_ZFS) AC_MSG_RESULT([$enable_debug_kmem]) ]) dnl # dnl # Disabled by default, provides detailed memory tracking. This feature dnl # also requires --enable-debug-kmem to be set. When enabled not only will dnl # total bytes be tracked but also the location of every kmem_alloc() and dnl # kmem_free(). When the module is unloaded a list of all leaked addresses dnl # and where they were allocated will be dumped to the console. Enabling dnl # this feature has a significant impact on performance but it makes finding dnl # memory leaks straight forward. dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM_TRACKING], [ AC_MSG_CHECKING([whether detailed kmem tracking is enabled]) AC_ARG_ENABLE([debug-kmem-tracking], [AS_HELP_STRING([--enable-debug-kmem-tracking], [Enable detailed kmem tracking @<:@default=no@:>@])], [], [enable_debug_kmem_tracking=no]) AS_IF([test "x$enable_debug_kmem_tracking" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM_TRACKING" DEBUG_KMEM_TRACKING_ZFS="_with_debug_kmem_tracking" ], [ DEBUG_KMEM_TRACKING_ZFS="_without_debug_kmem_tracking" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_TRACKING_ZFS) AC_MSG_RESULT([$enable_debug_kmem_tracking]) ]) AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], [ AS_IF([sysctl -n kern.conftxt | grep -Fqx $'options\tINVARIANTS'], [enable_invariants="yes"], [enable_invariants="no"]) ]) AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT], [ AM_COND_IF([BUILD_FREEBSD], [ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], [enable_invariants="no"]) ]) dnl # dnl # Detected for the running kernel by default, enables INVARIANTS features dnl # in the FreeBSD kernel module. This feature must be used when building dnl # for a FreeBSD kernel with "options INVARIANTS" in the KERNCONF and must dnl # not be used when the INVARIANTS option is absent. dnl # AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [ AC_MSG_CHECKING([whether FreeBSD kernel INVARIANTS checks are enabled]) AC_ARG_ENABLE([invariants], [AS_HELP_STRING([--enable-invariants], [Enable FreeBSD kernel INVARIANTS checks [[default: detect]]])], [], [ZFS_AC_DEBUG_INVARIANTS_DETECT]) AS_IF([test "x$enable_invariants" = xyes], [WITH_INVARIANTS="true"], [WITH_INVARIANTS=""]) AC_SUBST(WITH_INVARIANTS) AC_MSG_RESULT([$enable_invariants]) ]) AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ AX_COUNT_CPUS([]) AC_SUBST(CPU_COUNT) ZFS_AC_CONFIG_ALWAYS_CC_NO_CLOBBERED ZFS_AC_CONFIG_ALWAYS_CC_INFINITE_RECURSION ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_INFINITE_RECURSION ZFS_AC_CONFIG_ALWAYS_CC_IMPLICIT_FALLTHROUGH ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH ZFS_AC_CONFIG_ALWAYS_CC_FORMAT_OVERFLOW ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_CC_ASAN ZFS_AC_CONFIG_ALWAYS_CC_UBSAN ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD ZFS_AC_CONFIG_ALWAYS_SYSTEM ZFS_AC_CONFIG_ALWAYS_ARCH ZFS_AC_CONFIG_ALWAYS_PYTHON ZFS_AC_CONFIG_ALWAYS_PYZFS ZFS_AC_CONFIG_ALWAYS_SED ZFS_AC_CONFIG_ALWAYS_CPPCHECK ZFS_AC_CONFIG_ALWAYS_SHELLCHECK ZFS_AC_CONFIG_ALWAYS_PARALLEL ]) AC_DEFUN([ZFS_AC_CONFIG], [ dnl # Remove the previous build test directory. rm -Rf build ZFS_CONFIG=all AC_ARG_WITH([config], AS_HELP_STRING([--with-config=CONFIG], [Config file 'kernel|user|all|srpm']), [ZFS_CONFIG="$withval"]) AC_ARG_ENABLE([linux-builtin], [AS_HELP_STRING([--enable-linux-builtin], [Configure for builtin in-tree kernel modules @<:@default=no@:>@])], [], [enable_linux_builtin=no]) AC_MSG_CHECKING([zfs config]) AC_MSG_RESULT([$ZFS_CONFIG]); AC_SUBST(ZFS_CONFIG) ZFS_AC_CONFIG_ALWAYS AM_COND_IF([BUILD_LINUX], [ AC_ARG_VAR([TEST_JOBS], [simultaneous jobs during configure]) if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then TEST_JOBS=$CPU_COUNT fi AC_SUBST(TEST_JOBS) ]) ZFS_INIT_SYSV= ZFS_INIT_SYSTEMD= ZFS_WANT_MODULES_LOAD_D= case "$ZFS_CONFIG" in kernel) ZFS_AC_CONFIG_KERNEL ;; user) ZFS_AC_CONFIG_USER ;; all) ZFS_AC_CONFIG_USER ZFS_AC_CONFIG_KERNEL ;; dist) ;; srpm) ;; *) AC_MSG_RESULT([Error!]) AC_MSG_ERROR([Bad value "$ZFS_CONFIG" for --with-config, user kernel|user|all|srpm]) ;; esac AM_CONDITIONAL([INIT_SYSV], [test "x$ZFS_INIT_SYSV" = "xyes"]) AM_CONDITIONAL([INIT_SYSTEMD], [test "x$ZFS_INIT_SYSTEMD" = "xyes"]) AM_CONDITIONAL([WANT_MODULES_LOAD_D], [test "x$ZFS_WANT_MODULES_LOAD_D" = "xyes"]) AM_CONDITIONAL([CONFIG_USER], [test "$ZFS_CONFIG" = user -o "$ZFS_CONFIG" = all]) AM_CONDITIONAL([CONFIG_KERNEL], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$enable_linux_builtin" != xyes ]) AM_CONDITIONAL([CONFIG_QAT], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) ]) dnl # dnl # Check for rpm+rpmbuild to build RPM packages. If these tools dnl # are missing it is non-fatal but you will not be able to build dnl # RPM packages and will be warned if you try too. dnl # dnl # By default the generic spec file will be used because it requires dnl # minimal dependencies. Distribution specific spec files can be dnl # placed under the 'rpm/' directory and enabled using dnl # the --with-spec= configure option. dnl # AC_DEFUN([ZFS_AC_RPM], [ RPM=rpm RPMBUILD=rpmbuild AC_MSG_CHECKING([whether $RPM is available]) AS_IF([tmp=$($RPM --version 2>/dev/null)], [ RPM_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPM=yes AC_MSG_RESULT([$HAVE_RPM ($RPM_VERSION)]) ],[ HAVE_RPM=no AC_MSG_RESULT([$HAVE_RPM]) ]) AC_MSG_CHECKING([whether $RPMBUILD is available]) AS_IF([tmp=$($RPMBUILD --version 2>/dev/null)], [ RPMBUILD_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPMBUILD=yes AC_MSG_RESULT([$HAVE_RPMBUILD ($RPMBUILD_VERSION)]) ],[ HAVE_RPMBUILD=no AC_MSG_RESULT([$HAVE_RPMBUILD]) ]) RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUGINFO_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(UBSAN_ZFS) 1"' AS_IF([test "x$enable_debuginfo" = xyes], [ RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "__strip /bin/true"' ]) RPM_DEFINE_UTIL=' --define "_initconfdir $(initconfdir)"' dnl # Make the next three RPM_DEFINE_UTIL additions conditional, since dnl # their values may not be set when running: dnl # dnl # ./configure --with-config=srpm dnl # AS_IF([test -n "$dracutdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_dracutdir $(dracutdir)"' ]) AS_IF([test -n "$udevdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevdir $(udevdir)"' ]) AS_IF([test -n "$udevruledir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' ]) + AS_IF([test -n "$bashcompletiondir" ], [ + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_bashcompletiondir $(bashcompletiondir)"' + ]) RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_VERSION)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_PKG_VERSION)' dnl # Override default lib directory on Debian/Ubuntu systems. The dnl # provided /usr/lib/rpm/platform//macros files do not dnl # specify the correct path for multiarch systems as described dnl # by the packaging guidelines. dnl # dnl # https://wiki.ubuntu.com/MultiarchSpec dnl # https://wiki.debian.org/Multiarch/Implementation dnl # AS_IF([test "$DEFAULT_PACKAGE" = "deb"], [ MULTIARCH_LIBDIR="lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)" RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_lib $(MULTIARCH_LIBDIR)"' AC_SUBST(MULTIARCH_LIBDIR) ]) dnl # Make RPM_DEFINE_KMOD additions conditional on CONFIG_KERNEL, dnl # since the values will not be set otherwise. The spec files dnl # provide defaults for them. dnl # RPM_DEFINE_KMOD='--define "_wrong_version_format_terminate_build 0"' AM_COND_IF([CONFIG_KERNEL], [ RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernels $(LINUX_VERSION)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "ksrc $(LINUX)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kobj $(LINUX_OBJ)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cc KERNEL_CC=$(KERNEL_CC)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_ld KERNEL_LD=$(KERNEL_LD)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_llvm KERNEL_LLVM=$(KERNEL_LLVM)"' ]) RPM_DEFINE_DKMS='' SRPM_DEFINE_COMMON='--define "build_src_rpm 1"' SRPM_DEFINE_UTIL= SRPM_DEFINE_KMOD= SRPM_DEFINE_DKMS= RPM_SPEC_DIR="rpm/generic" AC_ARG_WITH([spec], AS_HELP_STRING([--with-spec=SPEC], [Spec files 'generic|redhat']), [RPM_SPEC_DIR="rpm/$withval"]) AC_MSG_CHECKING([whether spec files are available]) AC_MSG_RESULT([yes ($RPM_SPEC_DIR/*.spec.in)]) AC_SUBST(HAVE_RPM) AC_SUBST(RPM) AC_SUBST(RPM_VERSION) AC_SUBST(HAVE_RPMBUILD) AC_SUBST(RPMBUILD) AC_SUBST(RPMBUILD_VERSION) AC_SUBST(RPM_SPEC_DIR) AC_SUBST(RPM_DEFINE_UTIL) AC_SUBST(RPM_DEFINE_KMOD) AC_SUBST(RPM_DEFINE_DKMS) AC_SUBST(RPM_DEFINE_COMMON) AC_SUBST(SRPM_DEFINE_UTIL) AC_SUBST(SRPM_DEFINE_KMOD) AC_SUBST(SRPM_DEFINE_DKMS) AC_SUBST(SRPM_DEFINE_COMMON) ]) dnl # dnl # Check for dpkg+dpkg-buildpackage to build DEB packages. If these dnl # tools are missing it is non-fatal but you will not be able to build dnl # DEB packages and will be warned if you try too. dnl # AC_DEFUN([ZFS_AC_DPKG], [ DPKG=dpkg DPKGBUILD=dpkg-buildpackage AC_MSG_CHECKING([whether $DPKG is available]) AS_IF([tmp=$($DPKG --version 2>/dev/null)], [ DPKG_VERSION=$(echo $tmp | $AWK '/Debian/ { print $[7] }') HAVE_DPKG=yes AC_MSG_RESULT([$HAVE_DPKG ($DPKG_VERSION)]) ],[ HAVE_DPKG=no AC_MSG_RESULT([$HAVE_DPKG]) ]) AC_MSG_CHECKING([whether $DPKGBUILD is available]) AS_IF([tmp=$($DPKGBUILD --version 2>/dev/null)], [ DPKGBUILD_VERSION=$(echo $tmp | \ $AWK '/Debian/ { print $[4] }' | cut -f-4 -d'.') HAVE_DPKGBUILD=yes AC_MSG_RESULT([$HAVE_DPKGBUILD ($DPKGBUILD_VERSION)]) ],[ HAVE_DPKGBUILD=no AC_MSG_RESULT([$HAVE_DPKGBUILD]) ]) AC_SUBST(HAVE_DPKG) AC_SUBST(DPKG) AC_SUBST(DPKG_VERSION) AC_SUBST(HAVE_DPKGBUILD) AC_SUBST(DPKGBUILD) AC_SUBST(DPKGBUILD_VERSION) AC_SUBST([CFGOPTS], ["$CFGOPTS"]) ]) dnl # dnl # Until native packaging for various different packing systems dnl # can be added the least we can do is attempt to use alien to dnl # convert the RPM packages to the needed package type. This is dnl # a hack but so far it has worked reasonable well. dnl # AC_DEFUN([ZFS_AC_ALIEN], [ ALIEN=alien AC_MSG_CHECKING([whether $ALIEN is available]) AS_IF([tmp=$($ALIEN --version 2>/dev/null)], [ ALIEN_VERSION=$(echo $tmp | $AWK '{ print $[3] }') ALIEN_MAJOR=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[1] }') ALIEN_MINOR=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[2] }') ALIEN_POINT=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[3] }') HAVE_ALIEN=yes AC_MSG_RESULT([$HAVE_ALIEN ($ALIEN_VERSION)]) ],[ HAVE_ALIEN=no AC_MSG_RESULT([$HAVE_ALIEN]) ]) AC_SUBST(HAVE_ALIEN) AC_SUBST(ALIEN) AC_SUBST(ALIEN_VERSION) AC_SUBST(ALIEN_MAJOR) AC_SUBST(ALIEN_MINOR) AC_SUBST(ALIEN_POINT) ]) dnl # dnl # Using the VENDOR tag from config.guess set the default dnl # package type for 'make pkg': (rpm | deb | tgz) dnl # AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([os distribution]) AC_ARG_WITH([vendor], [AS_HELP_STRING([--with-vendor], [Distribution vendor @<:@default=check@:>@])], [with_vendor=$withval], [with_vendor=check]) AS_IF([test "x$with_vendor" = "xcheck"],[ if test -f /etc/toss-release ; then VENDOR=toss ; elif test -f /etc/fedora-release ; then VENDOR=fedora ; elif test -f /etc/redhat-release ; then VENDOR=redhat ; elif test -f /etc/gentoo-release ; then VENDOR=gentoo ; elif test -f /etc/arch-release ; then VENDOR=arch ; elif test -f /etc/SuSE-release ; then VENDOR=sles ; elif test -f /etc/slackware-version ; then VENDOR=slackware ; elif test -f /etc/lunar.release ; then VENDOR=lunar ; elif test -f /etc/lsb-release ; then VENDOR=ubuntu ; elif test -f /etc/debian_version ; then VENDOR=debian ; elif test -f /etc/alpine-release ; then VENDOR=alpine ; elif test -f /bin/freebsd-version ; then VENDOR=freebsd ; elif test -f /etc/openEuler-release ; then VENDOR=openeuler ; else VENDOR= ; fi], [ test "x${with_vendor}" != x],[ VENDOR="$with_vendor" ], [ VENDOR= ; ] ) AC_MSG_RESULT([$VENDOR]) AC_SUBST(VENDOR) AC_MSG_CHECKING([default package type]) case "$VENDOR" in toss) DEFAULT_PACKAGE=rpm ;; redhat) DEFAULT_PACKAGE=rpm ;; fedora) DEFAULT_PACKAGE=rpm ;; gentoo) DEFAULT_PACKAGE=tgz ;; alpine) DEFAULT_PACKAGE=tgz ;; arch) DEFAULT_PACKAGE=tgz ;; sles) DEFAULT_PACKAGE=rpm ;; slackware) DEFAULT_PACKAGE=tgz ;; lunar) DEFAULT_PACKAGE=tgz ;; ubuntu) DEFAULT_PACKAGE=deb ;; debian) DEFAULT_PACKAGE=deb ;; freebsd) DEFAULT_PACKAGE=pkg ;; openeuler) DEFAULT_PACKAGE=rpm ;; *) DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) AC_MSG_CHECKING([default init directory]) case "$VENDOR" in freebsd) initdir=$sysconfdir/rc.d ;; *) initdir=$sysconfdir/init.d;; esac AC_MSG_RESULT([$initdir]) AC_SUBST(initdir) AC_MSG_CHECKING([default shell]) case "$VENDOR" in gentoo) DEFAULT_INIT_SHELL="/sbin/openrc-run";; alpine) DEFAULT_INIT_SHELL="/sbin/openrc-run";; *) DEFAULT_INIT_SHELL="/bin/sh" ;; esac AC_MSG_RESULT([$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SHELL) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], [DEFAULT_INIT_NFS_SERVER="nfs-kernel-server"], [DEFAULT_INIT_NFS_SERVER="nfs"] ) AC_MSG_RESULT([$DEFAULT_INIT_NFS_SERVER]) AC_SUBST(DEFAULT_INIT_NFS_SERVER) AC_MSG_CHECKING([default init config directory]) case "$VENDOR" in alpine) initconfdir=/etc/conf.d ;; gentoo) initconfdir=/etc/conf.d ;; toss) initconfdir=/etc/sysconfig ;; redhat) initconfdir=/etc/sysconfig ;; fedora) initconfdir=/etc/sysconfig ;; sles) initconfdir=/etc/sysconfig ;; openeuler) initconfdir=/etc/sysconfig ;; ubuntu) initconfdir=/etc/default ;; debian) initconfdir=/etc/default ;; freebsd) initconfdir=$sysconfdir/rc.conf.d;; *) initconfdir=/etc/default ;; esac AC_MSG_RESULT([$initconfdir]) AC_SUBST(initconfdir) AC_MSG_CHECKING([whether initramfs-tools is available]) if test -d /usr/share/initramfs-tools ; then RPM_DEFINE_INITRAMFS='--define "_initramfs 1"' AC_MSG_RESULT([yes]) else RPM_DEFINE_INITRAMFS='' AC_MSG_RESULT([no]) fi AC_SUBST(RPM_DEFINE_INITRAMFS) AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; debian) bashcompletiondir=/usr/share/bash-completion/completions ;; freebsd) bashcompletiondir=$sysconfdir/bash_completion.d;; *) bashcompletiondir=/etc/bash_completion.d ;; esac AC_MSG_RESULT([$bashcompletiondir]) AC_SUBST(bashcompletiondir) ]) dnl # dnl # Default ZFS package configuration dnl # AC_DEFUN([ZFS_AC_PACKAGE], [ ZFS_AC_DEFAULT_PACKAGE AS_IF([test x$VENDOR != xfreebsd], [ ZFS_AC_RPM ZFS_AC_DPKG ZFS_AC_ALIEN ]) ]) diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_arc.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_arc.h index c494f48bb48b..f749223daa72 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_arc.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_arc.h @@ -1,408 +1,404 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #if defined(_KERNEL) #if defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_arc #if !defined(_TRACE_ARC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ARC_H #include #include #include /* For ZIO macros */ /* * Generic support for one argument tracepoints of the form: * * DTRACE_PROBE1(..., * arc_buf_hdr_t *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_PROTO(arc_buf_hdr_t *ab), TP_ARGS(ab), TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) __field(uint32_t, hdr_mru_hits) __field(uint32_t, hdr_mru_ghost_hits) __field(uint32_t, hdr_mfu_hits) __field(uint32_t, hdr_mfu_ghost_hits) __field(uint32_t, hdr_l2_hits) __field(int64_t, hdr_refcount) ), TP_fast_assign( __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; __entry->hdr_flags = ab->b_flags; - __entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt; __entry->hdr_psize = ab->b_psize; __entry->hdr_lsize = ab->b_lsize; __entry->hdr_spa = ab->b_spa; __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; __entry->hdr_access = ab->b_l1hdr.b_arc_access; __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; __entry->hdr_l2_hits = ab->b_l2hdr.b_hits; __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu " + "flags 0x%x type %u psize %u lsize %u spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize, + __entry->hdr_type, __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, __entry->hdr_refcount) ); /* END CSTYLED */ #define DEFINE_ARC_BUF_HDR_EVENT(name) \ DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ TP_PROTO(arc_buf_hdr_t *ab), \ TP_ARGS(ab)) DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__iohit); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__uncached); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); /* * Generic support for two argument tracepoints of the form: * * DTRACE_PROBE2(..., * vdev_t *, ..., * zio_t *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_rw_class, TP_PROTO(vdev_t *vd, zio_t *zio), TP_ARGS(vd, zio), TP_STRUCT__entry( __field(uint64_t, vdev_id) __field(uint64_t, vdev_guid) __field(uint64_t, vdev_state) ZIO_TP_STRUCT_ENTRY ), TP_fast_assign( __entry->vdev_id = vd->vdev_id; __entry->vdev_guid = vd->vdev_guid; __entry->vdev_state = vd->vdev_state; ZIO_TP_FAST_ASSIGN ), TP_printk("vdev { id %llu guid %llu state %llu } " ZIO_TP_PRINTK_FMT, __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state, ZIO_TP_PRINTK_ARGS) ); /* END CSTYLED */ #define DEFINE_L2ARC_RW_EVENT(name) \ DEFINE_EVENT(zfs_l2arc_rw_class, name, \ TP_PROTO(vdev_t *vd, zio_t *zio), \ TP_ARGS(vd, zio)) DEFINE_L2ARC_RW_EVENT(zfs_l2arc__read); DEFINE_L2ARC_RW_EVENT(zfs_l2arc__write); /* * Generic support for two argument tracepoints of the form: * * DTRACE_PROBE2(..., * zio_t *, ..., * l2arc_write_callback_t *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_iodone_class, TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), TP_ARGS(zio, cb), TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY), TP_fast_assign(ZIO_TP_FAST_ASSIGN), TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS) ); /* END CSTYLED */ #define DEFINE_L2ARC_IODONE_EVENT(name) \ DEFINE_EVENT(zfs_l2arc_iodone_class, name, \ TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), \ TP_ARGS(zio, cb)) DEFINE_L2ARC_IODONE_EVENT(zfs_l2arc__iodone); /* * Generic support for four argument tracepoints of the form: * * DTRACE_PROBE4(..., * arc_buf_hdr_t *, ..., * const blkptr_t *, * uint64_t, * const zbookmark_phys_t *); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_PROTO(arc_buf_hdr_t *hdr, const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), TP_ARGS(hdr, bp, size, zb), TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) __field(uint32_t, hdr_mru_hits) __field(uint32_t, hdr_mru_ghost_hits) __field(uint32_t, hdr_mfu_hits) __field(uint32_t, hdr_mfu_ghost_hits) __field(uint32_t, hdr_l2_hits) __field(int64_t, hdr_refcount) __array(uint64_t, bp_dva0, 2) __array(uint64_t, bp_dva1, 2) __array(uint64_t, bp_dva2, 2) __array(uint64_t, bp_cksum, 4) __field(uint64_t, bp_lsize) __field(uint64_t, zb_objset) __field(uint64_t, zb_object) __field(int64_t, zb_level) __field(uint64_t, zb_blkid) ), TP_fast_assign( __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt; __entry->hdr_psize = hdr->b_psize; __entry->hdr_lsize = hdr->b_lsize; __entry->hdr_spa = hdr->b_spa; __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; __entry->hdr_access = hdr->b_l1hdr.b_arc_access; __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; __entry->hdr_l2_hits = hdr->b_l2hdr.b_hits; __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; __entry->bp_dva1[0] = bp->blk_dva[1].dva_word[0]; __entry->bp_dva1[1] = bp->blk_dva[1].dva_word[1]; __entry->bp_dva2[0] = bp->blk_dva[2].dva_word[0]; __entry->bp_dva2[1] = bp->blk_dva[2].dva_word[1]; __entry->bp_cksum[0] = bp->blk_cksum.zc_word[0]; __entry->bp_cksum[1] = bp->blk_cksum.zc_word[1]; __entry->bp_cksum[2] = bp->blk_cksum.zc_word[2]; __entry->bp_cksum[3] = bp->blk_cksum.zc_word[3]; __entry->bp_lsize = size; __entry->zb_objset = zb->zb_objset; __entry->zb_object = zb->zb_object; __entry->zb_level = zb->zb_level; __entry->zb_blkid = zb->zb_blkid; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u " + "flags 0x%x psize %u lsize %u spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " "0x%llx:0x%llx cksum 0x%llx:0x%llx:0x%llx:0x%llx " "lsize %llu } zb { objset %llu object %llu level %lli " "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize, + __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, __entry->hdr_refcount, __entry->bp_dva0[0], __entry->bp_dva0[1], __entry->bp_dva1[0], __entry->bp_dva1[1], __entry->bp_dva2[0], __entry->bp_dva2[1], __entry->bp_cksum[0], __entry->bp_cksum[1], __entry->bp_cksum[2], __entry->bp_cksum[3], __entry->bp_lsize, __entry->zb_objset, __entry->zb_object, __entry->zb_level, __entry->zb_blkid) ); /* END CSTYLED */ #define DEFINE_ARC_MISS_EVENT(name) \ DEFINE_EVENT(zfs_arc_miss_class, name, \ TP_PROTO(arc_buf_hdr_t *hdr, \ const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), \ TP_ARGS(hdr, bp, size, zb)) DEFINE_ARC_MISS_EVENT(zfs_arc__miss); /* * Generic support for four argument tracepoints of the form: * * DTRACE_PROBE4(..., * l2arc_dev_t *, ..., * list_t *, ..., * uint64_t, ..., * boolean_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_evict_class, TP_PROTO(l2arc_dev_t *dev, list_t *buflist, uint64_t taddr, boolean_t all), TP_ARGS(dev, buflist, taddr, all), TP_STRUCT__entry( __field(uint64_t, vdev_id) __field(uint64_t, vdev_guid) __field(uint64_t, vdev_state) __field(uint64_t, l2ad_hand) __field(uint64_t, l2ad_start) __field(uint64_t, l2ad_end) __field(boolean_t, l2ad_first) __field(boolean_t, l2ad_writing) __field(uint64_t, taddr) __field(boolean_t, all) ), TP_fast_assign( __entry->vdev_id = dev->l2ad_vdev->vdev_id; __entry->vdev_guid = dev->l2ad_vdev->vdev_guid; __entry->vdev_state = dev->l2ad_vdev->vdev_state; __entry->l2ad_hand = dev->l2ad_hand; __entry->l2ad_start = dev->l2ad_start; __entry->l2ad_end = dev->l2ad_end; __entry->l2ad_first = dev->l2ad_first; __entry->l2ad_writing = dev->l2ad_writing; __entry->taddr = taddr; __entry->all = all; ), TP_printk("l2ad { vdev { id %llu guid %llu state %llu } " "hand %llu start %llu end %llu " "first %d writing %d } taddr %llu all %d", __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state, __entry->l2ad_hand, __entry->l2ad_start, __entry->l2ad_end, __entry->l2ad_first, __entry->l2ad_writing, __entry->taddr, __entry->all) ); /* END CSTYLED */ #define DEFINE_L2ARC_EVICT_EVENT(name) \ DEFINE_EVENT(zfs_l2arc_evict_class, name, \ TP_PROTO(l2arc_dev_t *dev, list_t *buflist, uint64_t taddr, boolean_t all),\ TP_ARGS(dev, buflist, taddr, all)) DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict); /* * Generic support for three argument tracepoints of the form: * * DTRACE_PROBE3(..., * uint64_t, ..., * uint64_t, ..., * uint64_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class, TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), TP_ARGS(amount, arc_evict_count, aew_count), TP_STRUCT__entry( __field(uint64_t, amount) __field(uint64_t, arc_evict_count) __field(uint64_t, aew_count) ), TP_fast_assign( __entry->amount = amount; __entry->arc_evict_count = arc_evict_count; __entry->aew_count = aew_count; ), TP_printk("amount %llu arc_evict_count %llu aew_count %llu", __entry->amount, __entry->arc_evict_count, __entry->aew_count) ); /* END CSTYLED */ #define DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \ DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \ TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \ TP_ARGS(amount, arc_evict_count, aew_count)) DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction); #endif /* _TRACE_ARC_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_arc #include #else DEFINE_DTRACE_PROBE1(arc__hit); DEFINE_DTRACE_PROBE1(arc__iohit); DEFINE_DTRACE_PROBE1(arc__evict); DEFINE_DTRACE_PROBE1(arc__delete); DEFINE_DTRACE_PROBE1(new_state__mru); DEFINE_DTRACE_PROBE1(new_state__mfu); DEFINE_DTRACE_PROBE1(new_state__uncached); DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync); DEFINE_DTRACE_PROBE1(l2arc__hit); DEFINE_DTRACE_PROBE1(l2arc__miss); DEFINE_DTRACE_PROBE2(l2arc__read); DEFINE_DTRACE_PROBE2(l2arc__write); DEFINE_DTRACE_PROBE2(l2arc__iodone); DEFINE_DTRACE_PROBE3(arc__wait__for__eviction); DEFINE_DTRACE_PROBE4(arc__miss); DEFINE_DTRACE_PROBE4(l2arc__evict); #endif /* HAVE_DECLARE_EVENT_CLASS */ #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index 78774792f367..adff42c55d05 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -1,1097 +1,1093 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Delphix. All rights reserved. * Copyright (c) 2013, Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. */ #ifndef _SYS_ARC_IMPL_H #define _SYS_ARC_IMPL_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_uncached - uncacheable prefetch, to be evicted * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will acquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ typedef struct arc_state { /* * list of evictable buffers */ multilist_t arcs_list[ARC_BUFC_NUMTYPES]; /* * supports the "dbufs" kstat */ arc_state_type_t arcs_state; /* * total amount of data in this state. */ zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of evictable data in this state */ zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * amount of hit bytes for this state (counted only for ghost states) */ wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_encrypted; boolean_t acb_compressed; boolean_t acb_noauth; boolean_t acb_nobuf; boolean_t acb_wait; int acb_wait_error; kmutex_t acb_wait_lock; kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; arc_callback_t *acb_prev; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; /* * ARC buffers are separated into multiple structs as a memory saving measure: * - Common fields struct, always defined, and embedded within it: * - L2-only fields, always allocated but undefined when not in L2ARC * - L1-only fields, only allocated when in L1ARC * * Buffer in L1 Buffer only in L2 * +------------------------+ +------------------------+ * | arc_buf_hdr_t | | arc_buf_hdr_t | * | | | | * | | | | * | | | | * +------------------------+ +------------------------+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | * | (undefined if L1-only) | | | * +------------------------+ +------------------------+ * | l1arc_buf_hdr_t | * | | * | | * | | * | | * +------------------------+ * * Because it's possible for the L2ARC to become extremely large, we can wind * up eating a lot of memory in L2ARC buffer headers, so the size of a header * is minimized by only allocating the fields necessary for an L1-cached buffer * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple * words in pointers. arc_hdr_realloc() is used to switch a header between * these two allocation states. */ typedef struct l1arc_buf_hdr { - /* for waiting on reads to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; /* protected by hash lock */ clock_t b_arc_access; uint32_t b_mru_hits; uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; - uint32_t b_bufcnt; + uint8_t b_byteswap; arc_buf_t *b_buf; /* self protecting */ zfs_refcount_t b_refcnt; arc_callback_t *b_acb; abd_t *b_pabd; #ifdef ZFS_DEBUG zio_cksum_t *b_freeze_cksum; kmutex_t b_freeze_lock; #endif } l1arc_buf_hdr_t; typedef enum l2arc_dev_hdr_flags_t { L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ } l2arc_dev_hdr_flags_t; /* * Pointer used in persistent L2ARC (for pointing to log blocks). */ typedef struct l2arc_log_blkptr { /* * Offset of log block within the device, in bytes */ uint64_t lbp_daddr; /* * Aligned payload size (in bytes) of the log block */ uint64_t lbp_payload_asize; /* * Offset in bytes of the first buffer in the payload */ uint64_t lbp_payload_start; /* * lbp_prop has the following format: * * logical size (in bytes) * * aligned (after compression) size (in bytes) * * compression algorithm (we always LZ4-compress l2arc logs) * * checksum algorithm (used for lbp_cksum) */ uint64_t lbp_prop; zio_cksum_t lbp_cksum; /* checksum of log */ } l2arc_log_blkptr_t; /* * The persistent L2ARC device header. * Byte order of magic determines whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_dev_hdr_phys { uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ uint64_t dh_version; /* Persistent L2ARC version */ /* * Global L2ARC device state and metadata. */ uint64_t dh_spa_guid; uint64_t dh_vdev_guid; uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ uint64_t dh_evict; /* evicted offset in bytes */ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ /* * Used in zdb.c for determining if a log block is valid, in the same * way that l2arc_rebuild() does. */ uint64_t dh_start; /* mirror of l2ad_start */ uint64_t dh_end; /* mirror of l2ad_end */ /* * Start of log block chain. [0] -> newest log, [1] -> one older (used * for initiating prefetch). */ l2arc_log_blkptr_t dh_start_lbps[2]; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ /* * Mirrors of vdev_trim_action_time and vdev_trim_state, used to * display when the cache device was fully trimmed for the last * time. */ uint64_t dh_trim_action_time; uint64_t dh_trim_state; const uint64_t dh_pad[30]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; _Static_assert(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE, "l2arc_dev_hdr_phys_t wrong size"); /* * A single ARC buffer header entry in a l2arc_log_blk_phys_t. */ typedef struct l2arc_log_ent_phys { dva_t le_dva; /* dva of buffer */ uint64_t le_birth; /* birth txg of buffer */ /* * le_prop has the following format: * * logical size (in bytes) * * physical (compressed) size (in bytes) * * compression algorithm * * object type (used to restore arc_buf_contents_t) * * protected status (used for encryption) * * prefetch status (used in l2arc_read_done()) */ uint64_t le_prop; uint64_t le_daddr; /* buf location on l2dev */ uint64_t le_complevel; /* * We pad the size of each entry to a power of 2 so that the size of * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, * because of the L2ARC_SET_*SIZE macros. */ const uint64_t le_pad[2]; /* pad to 64 bytes */ } l2arc_log_ent_phys_t; #define L2ARC_LOG_BLK_MAX_ENTRIES (1022) /* * A log block of up to 1022 ARC buffer log entries, chained into the * persistent L2ARC metadata linked list. Byte order of magic determines * whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_log_blk_phys { uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ /* * There are 2 chains (headed by dh_start_lbps[2]), and this field * points back to the previous block in this chain. We alternate * which chain we append to, so they are time-wise and offset-wise * interleaved, but that is an optimization rather than for * correctness. */ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ /* * Pad header section to 128 bytes */ uint64_t lb_pad[7]; /* Payload */ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; } l2arc_log_blk_phys_t; /* 64K total */ /* * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. */ _Static_assert(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), 1ULL << SPA_MINBLOCKSHIFT), "l2arc_log_blk_phys_t misaligned"); _Static_assert(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE, "l2arc_log_blk_phys_t too small"); _Static_assert(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE, "l2arc_log_blk_phys_t too big"); /* * These structures hold in-flight abd buffers for log blocks as they're being * written to the L2ARC device. */ typedef struct l2arc_lb_abd_buf { abd_t *abd; list_node_t node; } l2arc_lb_abd_buf_t; /* * These structures hold pointers to log blocks present on the L2ARC device. */ typedef struct l2arc_lb_ptr_buf { l2arc_log_blkptr_t *lb_ptr; list_node_t node; } l2arc_lb_ptr_buf_t; /* Macros for setting fields in le_prop and lbp_prop */ #define L2BLK_GET_LSIZE(field) \ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_LSIZE(field, x) \ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_PSIZE(field) \ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_PSIZE(field, x) \ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_COMPRESS(field) \ BF64_GET((field), 32, SPA_COMPRESSBITS) #define L2BLK_SET_COMPRESS(field, x) \ BF64_SET((field), 32, SPA_COMPRESSBITS, x) #define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) /* +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal. */ #define L2BLK_GET_TYPE(field) (BF64_GET((field), 48, 8) - 1) #define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, (x) + 1) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) #define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) #define PTR_SWAP(x, y) \ do { \ void *tmp = (x);\ x = y; \ y = tmp; \ } while (0) #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ /* * L2ARC Internals */ typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ zfs_refcount_t l2ad_alloc; /* allocated bytes */ /* * Persistence-related stuff */ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ int l2ad_log_ent_idx; /* index into cur log blk */ /* Number of bytes in current log block's payload */ uint64_t l2ad_log_blk_payload_asize; /* * Offset (in bytes) of the first buffer in current log block's * payload. */ uint64_t l2ad_log_blk_payload_start; /* Flag indicating whether a rebuild is scheduled or is going on */ boolean_t l2ad_rebuild; boolean_t l2ad_rebuild_cancel; boolean_t l2ad_rebuild_began; uint64_t l2ad_log_entries; /* entries per log blk */ uint64_t l2ad_evict; /* evicted offset in bytes */ /* List of pointers to log blocks present in the L2ARC device */ list_t l2ad_lbptr_list; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ zfs_refcount_t l2ad_lb_asize; /* * Number of log blocks present on the device. */ zfs_refcount_t l2ad_lb_count; boolean_t l2ad_trim_all; /* TRIM whole device */ } l2arc_dev_t; /* * Encrypted blocks will need to be stored encrypted on the L2ARC * disk as they appear in the main pool. In order for this to work we * need to pass around the encryption parameters so they can be used * to write data to the L2ARC. This struct is only defined in the * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED * flag set. */ typedef struct arc_buf_hdr_crypt { abd_t *b_rabd; /* raw encrypted data */ - dmu_object_type_t b_ot; /* object type */ - uint32_t b_ebufcnt; /* count of encrypted buffers */ /* dsobj for looking up encryption key for l2arc encryption */ uint64_t b_dsobj; + dmu_object_type_t b_ot; /* object type */ + /* encryption parameters */ uint8_t b_salt[ZIO_DATA_SALT_LEN]; uint8_t b_iv[ZIO_DATA_IV_LEN]; /* * Technically this could be removed since we will always be able to * get the mac from the bp when we need it. However, it is inconvenient * for callers of arc code to have to pass a bp in all the time. This * also allows us to assert that L2ARC data is properly encrypted to * match the data in the main storage pool. */ uint8_t b_mac[ZIO_DATA_MAC_LEN]; } arc_buf_hdr_crypt_t; typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ uint32_t b_hits; arc_state_type_t b_arcs_state; list_node_t b_l2node; } l2arc_buf_hdr_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ /* in-flight list of log blocks */ list_t l2wcb_abd_list; } l2arc_write_callback_t; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; arc_buf_contents_t b_type; uint8_t b_complevel; uint8_t b_reserved1; /* used for 4 byte alignment */ uint16_t b_reserved2; /* used for 4 byte alignment */ arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; /* * This field stores the size of the data buffer after * compression, and is set in the arc's zio completion handlers. * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). * * While the block pointers can store up to 32MB in their psize * field, we can only store up to 32MB minus 512B. This is due * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. * a field of zeros represents 512B in the bp). We can't use a * bias of 1 since we need to reserve a psize of zero, here, to * represent holes and embedded blocks. * * This isn't a problem in practice, since the maximum size of a * buffer is limited to 16MB, so we never need to store 32MB in * this field. Even in the upstream illumos code base, the * maximum size of a buffer is limited to 16MB. */ uint16_t b_psize; /* * This field stores the size of the data buffer before * compression, and cannot change once set. It is in units * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) */ uint16_t b_lsize; /* immutable */ uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; /* * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED * is set and the L1 header exists. */ arc_buf_hdr_crypt_t b_crypt_hdr; }; typedef struct arc_stats { /* Number of requests that were satisfied without I/O. */ kstat_named_t arcstat_hits; /* Number of requests for which I/O was already running. */ kstat_named_t arcstat_iohits; /* Number of requests for which I/O has to be issued. */ kstat_named_t arcstat_misses; /* Same three, but specifically for demand data. */ kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; /* Same three, but specifically for demand metadata. */ kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; /* Same three, but specifically for prefetch data. */ kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; /* Same three, but specifically for prefetch metadata. */ kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_uncached_hits; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped when updating the access state due to the * header having already been released after acquiring the hash lock. */ kstat_named_t arcstat_access_skip; /* * Number of buffers skipped because they have I/O in progress, are * indirect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach its target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_eligible_mfu; kstat_named_t arcstat_evict_l2_eligible_mru; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_meta; kstat_named_t arcstat_pd; kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ kstat_named_t arcstat_uncompressed_size; /* * Number of bytes stored in all the arc_buf_t's. This is classified * as "overhead" since this data is typically short-lived and will * be evicted from the arc when it becomes unreferenced unless the * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level * values have been set (see comment in dbuf.c for more information). */ kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by dmu_buf_impl_t objects. */ kstat_named_t arcstat_dbuf_size; /* * Number of bytes consumed by dnode_t objects. */ kstat_named_t arcstat_dnode_size; /* * Number of bytes consumed by bonus buffers. */ kstat_named_t arcstat_bonus_size; #if defined(COMPAT_FREEBSD11) /* * Sum of the previous three counters, provided for compatibility. */ kstat_named_t arcstat_other_size; #endif /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_anon_size; kstat_named_t arcstat_anon_data; kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mru_size; kstat_named_t arcstat_mru_data; kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here, is the fact that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; kstat_named_t arcstat_mru_ghost_data; kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mfu_size; kstat_named_t arcstat_mfu_data; kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; kstat_named_t arcstat_mfu_ghost_data; kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; /* * Total number of bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_size; kstat_named_t arcstat_uncached_data; kstat_named_t arcstat_uncached_metadata; /* * Number of data bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_data; /* * Number of metadata bytes that that are going to be evicted from ARC * due to ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; /* * Allocated size (in bytes) of L2ARC cached buffers by ARC state. */ kstat_named_t arcstat_l2_prefetch_asize; kstat_named_t arcstat_l2_mru_asize; kstat_named_t arcstat_l2_mfu_asize; /* * Allocated size (in bytes) of L2ARC cached buffers by buffer content * type. */ kstat_named_t arcstat_l2_bufc_data_asize; kstat_named_t arcstat_l2_bufc_metadata_asize; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_lsize; kstat_named_t arcstat_l2_psize; kstat_named_t arcstat_l2_hdr_size; /* * Number of L2ARC log blocks written. These are used for restoring the * L2ARC. Updated during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_log_blk_writes; /* * Moving average of the aligned size of the L2ARC log blocks, in * bytes. Updated during L2ARC rebuild and during writing of L2ARC * log blocks. */ kstat_named_t arcstat_l2_log_blk_avg_asize; /* Aligned size of L2ARC log blocks on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_asize; /* Number of L2ARC log blocks present on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_count; /* * Moving average of the aligned size of L2ARC restored data, in bytes, * to the aligned size of their metadata in L2ARC, in bytes. * Updated during L2ARC rebuild and during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_data_to_meta_ratio; /* * Number of times the L2ARC rebuild was successful for an L2ARC device. */ kstat_named_t arcstat_l2_rebuild_success; /* * Number of times the L2ARC rebuild failed because the device header * was in an unsupported format or corrupted. */ kstat_named_t arcstat_l2_rebuild_abort_unsupported; /* * Number of times the L2ARC rebuild failed because of IO errors * while reading a log block. */ kstat_named_t arcstat_l2_rebuild_abort_io_errors; /* * Number of times the L2ARC rebuild failed because of IO errors when * reading the device header. */ kstat_named_t arcstat_l2_rebuild_abort_dh_errors; /* * Number of L2ARC log blocks which failed to be restored due to * checksum errors. */ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; /* * Number of times the L2ARC rebuild was aborted due to low system * memory. */ kstat_named_t arcstat_l2_rebuild_abort_lowmem; /* Logical size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_size; /* Aligned size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_asize; /* * Number of L2ARC log entries (buffers) that were successfully * restored in ARC. */ kstat_named_t arcstat_l2_rebuild_bufs; /* * Number of L2ARC log entries (buffers) already cached in ARC. These * were not restored again. */ kstat_named_t arcstat_l2_rebuild_bufs_precached; /* * Number of L2ARC log blocks that were restored successfully. Each * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. */ kstat_named_t arcstat_l2_rebuild_log_blks; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_memory_all_bytes; kstat_named_t arcstat_memory_free_bytes; kstat_named_t arcstat_memory_available_bytes; kstat_named_t arcstat_no_grow; kstat_named_t arcstat_tempreserve; kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_dnode_limit; kstat_named_t arcstat_async_upgrade_sync; /* Number of predictive prefetch requests. */ kstat_named_t arcstat_predictive_prefetch; /* Number of requests for which predictive prefetch has completed. */ kstat_named_t arcstat_demand_hit_predictive_prefetch; /* Number of requests for which predictive prefetch was running. */ kstat_named_t arcstat_demand_iohit_predictive_prefetch; /* Number of prescient prefetch requests. */ kstat_named_t arcstat_prescient_prefetch; /* Number of requests for which prescient prefetch has completed. */ kstat_named_t arcstat_demand_hit_prescient_prefetch; /* Number of requests for which prescient prefetch was running. */ kstat_named_t arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; kstat_named_t arcstat_cached_only_in_progress; kstat_named_t arcstat_abd_chunk_waste_size; } arc_stats_t; typedef struct arc_sums { wmsum_t arcstat_hits; wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; wmsum_t arcstat_mfu_hits; wmsum_t arcstat_mfu_ghost_hits; wmsum_t arcstat_uncached_hits; wmsum_t arcstat_deleted; wmsum_t arcstat_mutex_miss; wmsum_t arcstat_access_skip; wmsum_t arcstat_evict_skip; wmsum_t arcstat_evict_not_enough; wmsum_t arcstat_evict_l2_cached; wmsum_t arcstat_evict_l2_eligible; wmsum_t arcstat_evict_l2_eligible_mfu; wmsum_t arcstat_evict_l2_eligible_mru; wmsum_t arcstat_evict_l2_ineligible; wmsum_t arcstat_evict_l2_skip; wmsum_t arcstat_hash_collisions; wmsum_t arcstat_hash_chains; aggsum_t arcstat_size; wmsum_t arcstat_compressed_size; wmsum_t arcstat_uncompressed_size; wmsum_t arcstat_overhead_size; wmsum_t arcstat_hdr_size; wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; wmsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; wmsum_t arcstat_l2_prefetch_asize; wmsum_t arcstat_l2_mru_asize; wmsum_t arcstat_l2_mfu_asize; wmsum_t arcstat_l2_bufc_data_asize; wmsum_t arcstat_l2_bufc_metadata_asize; wmsum_t arcstat_l2_feeds; wmsum_t arcstat_l2_rw_clash; wmsum_t arcstat_l2_read_bytes; wmsum_t arcstat_l2_write_bytes; wmsum_t arcstat_l2_writes_sent; wmsum_t arcstat_l2_writes_done; wmsum_t arcstat_l2_writes_error; wmsum_t arcstat_l2_writes_lock_retry; wmsum_t arcstat_l2_evict_lock_retry; wmsum_t arcstat_l2_evict_reading; wmsum_t arcstat_l2_evict_l1cached; wmsum_t arcstat_l2_free_on_write; wmsum_t arcstat_l2_abort_lowmem; wmsum_t arcstat_l2_cksum_bad; wmsum_t arcstat_l2_io_error; wmsum_t arcstat_l2_lsize; wmsum_t arcstat_l2_psize; aggsum_t arcstat_l2_hdr_size; wmsum_t arcstat_l2_log_blk_writes; wmsum_t arcstat_l2_log_blk_asize; wmsum_t arcstat_l2_log_blk_count; wmsum_t arcstat_l2_rebuild_success; wmsum_t arcstat_l2_rebuild_abort_unsupported; wmsum_t arcstat_l2_rebuild_abort_io_errors; wmsum_t arcstat_l2_rebuild_abort_dh_errors; wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors; wmsum_t arcstat_l2_rebuild_abort_lowmem; wmsum_t arcstat_l2_rebuild_size; wmsum_t arcstat_l2_rebuild_asize; wmsum_t arcstat_l2_rebuild_bufs; wmsum_t arcstat_l2_rebuild_bufs_precached; wmsum_t arcstat_l2_rebuild_log_blks; wmsum_t arcstat_memory_throttle_count; wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; wmsum_t arcstat_demand_iohit_predictive_prefetch; wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; } arc_sums_t; typedef struct arc_evict_waiter { list_node_t aew_node; kcondvar_t aew_cv; uint64_t aew_count; } arc_evict_waiter_t; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ wmsum_add(&arc_sums.stat, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ #define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ #define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ #define arc_anon (&ARC_anon) #define arc_mru (&ARC_mru) #define arc_mru_ghost (&ARC_mru_ghost) #define arc_mfu (&ARC_mfu) #define arc_mfu_ghost (&ARC_mfu_ghost) #define arc_l2c_only (&ARC_l2c_only) #define arc_uncached (&ARC_uncached) extern taskq_t *arc_prune_taskq; extern arc_stats_t arc_stats; extern arc_sums_t arc_sums; extern hrtime_t arc_growtime; extern boolean_t arc_warm; extern uint_t arc_grow_retry; extern uint_t arc_no_grow_shift; extern uint_t arc_shrink_shift; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; extern arc_state_t ARC_mfu; extern arc_state_t ARC_mru; extern uint_t zfs_arc_pc_percent; extern uint_t arc_lotsfree_percent; extern uint64_t zfs_arc_min; extern uint64_t zfs_arc_max; extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); extern void arc_prune_async(uint64_t); extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); extern void arc_tuning_update(boolean_t); extern void arc_register_hotplug(void); extern void arc_unregister_hotplug(void); extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); /* used in zdb.c */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); /* used in vdev_trim.c */ void l2arc_dev_hdr_update(l2arc_dev_t *dev); l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); #ifdef __cplusplus } #endif #endif /* _SYS_ARC_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/metaslab_impl.h b/sys/contrib/openzfs/include/sys/metaslab_impl.h index d328068890cc..4f434291ddbf 100644 --- a/sys/contrib/openzfs/include/sys/metaslab_impl.h +++ b/sys/contrib/openzfs/include/sys/metaslab_impl.h @@ -1,573 +1,572 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Metaslab allocation tracing record. */ typedef struct metaslab_alloc_trace { list_node_t mat_list_node; metaslab_group_t *mat_mg; metaslab_t *mat_msp; uint64_t mat_size; uint64_t mat_weight; uint32_t mat_dva_id; uint64_t mat_offset; int mat_allocator; } metaslab_alloc_trace_t; /* * Used by the metaslab allocation tracing facility to indicate * error conditions. These errors are stored to the offset member * of the metaslab_alloc_trace_t record and displayed by mdb. */ typedef enum trace_alloc_type { TRACE_ALLOC_FAILURE = -1ULL, TRACE_TOO_SMALL = -2ULL, TRACE_FORCE_GANG = -3ULL, TRACE_NOT_ALLOCATABLE = -4ULL, TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, TRACE_VDEV_ERROR = -8ULL, TRACE_DISABLED = -9ULL, } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_WEIGHT_CLAIM (1ULL << 61) #define METASLAB_WEIGHT_TYPE (1ULL << 60) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the * metaslabs by weight. The weight (and therefore the "best" metaslab) can * be determined in two different ways: by computing a weighted sum of all * the free space in the metaslab (a space based weight) or by counting only * the free segments of the largest size (a segment based weight). We prefer * the segment based weight because it reflects how the free space is * comprised, but we cannot always use it -- legacy pools do not have the * space map histogram information necessary to determine the largest * contiguous regions. Pools that have the space map histogram determine * the segment weight by looking at each bucket in the histogram and * determining the free space whose size in bytes is in the range: * [2^i, 2^(i+1)) * We then encode the largest index, i, that contains regions into the * segment-weighted value. * * Space-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC1| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC0| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * idx - index for the highest bucket in the histogram * count - number of segments in the specified bucket */ #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) #define WEIGHT_IS_SPACEBASED(weight) \ ((weight) == 0 || BF64_GET((weight), 60, 1)) #define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) /* * These macros are only applicable to segment-based weighting. */ #define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) #define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) /* * Per-allocator data structure. */ typedef struct metaslab_class_allocator { metaslab_group_t *mca_rotor; uint64_t mca_aliquot; /* * The allocation throttle works on a reservation system. Whenever * an asynchronous zio wants to perform an allocation it must * first reserve the number of blocks that it wants to allocate. * If there aren't sufficient slots available for the pending zio * then that I/O is throttled until more slots free up. The current * number of reserved allocations is maintained by the mca_alloc_slots * refcount. The mca_alloc_max_slots value determines the maximum * number of allocations that the system allows. Gang blocks are * allowed to reserve slots even if we've reached the maximum * number of allocations allowed. */ uint64_t mca_alloc_max_slots; zfs_refcount_t mca_alloc_slots; } ____cacheline_aligned metaslab_class_allocator_t; /* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines * the allocatable region for that vdev. Examples of these categories include * "normal" for data block allocations (i.e. main pool allocations) or "log" * for allocations designated for intent log devices (i.e. slog devices). * When a block allocation is requested from the SPA it is associated with a * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging * to the class can be used to satisfy that request. Allocations are done * by traversing the metaslab groups that are linked off of the mca_rotor field. * This rotor points to the next metaslab group where allocations will be * attempted. Allocating a block is a 3 step process -- select the metaslab * group, select the metaslab, and then allocate the block. The metaslab * class defines the low-level block allocator that will be used as the * final step in allocation. These allocators are pluggable allowing each class * to use a block allocator that best suits that class. */ struct metaslab_class { kmutex_t mc_lock; spa_t *mc_spa; const metaslab_ops_t *mc_ops; /* * Track the number of metaslab groups that have been initialized * and can accept allocations. An initialized metaslab group is * one has been completely added to the config (i.e. we have * updated the MOS config and the space has been added to the pool). */ uint64_t mc_groups; /* * Toggle to enable/disable the allocation throttle. */ boolean_t mc_alloc_throttle_enabled; uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; /* * List of all loaded metaslabs in the class, sorted in order of most * recent use. */ multilist_t mc_metaslab_txg_list; metaslab_class_allocator_t mc_allocator[]; }; /* * Per-allocator data structure. */ typedef struct metaslab_group_allocator { uint64_t mga_cur_max_alloc_queue_depth; zfs_refcount_t mga_alloc_queue_depth; metaslab_t *mga_primary; metaslab_t *mga_secondary; } metaslab_group_allocator_t; /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) * of a top-level vdev. They are linked together to form a circular linked * list and can belong to only one metaslab class. Metaslab groups may become * ineligible for allocations for a number of reasons such as limited free * space, fragmentation, or going offline. When this happens the allocator will * simply find the next metaslab group in the linked list and attempt * to allocate from that group instead. */ struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after * we have updated the MOS config and added the space to the pool. * We only allow allocation attempts to a metaslab group if it * has been initialized. */ boolean_t mg_initialized; uint64_t mg_free_capacity; /* percentage free */ int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; - taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; /* * In order for the allocation throttle to function properly, we cannot * have too many IOs going to each disk by default; the throttle * operates by allocating more work to disks that finish quickly, so * allocating larger chunks to each disk reduces its effectiveness. * However, if the number of IOs going to each allocator is too small, * we will not perform proper aggregation at the vdev_queue layer, * also resulting in decreased performance. Therefore, we will use a * ramp-up strategy. * * Each allocator in each metaslab group has a current queue depth * (mg_alloc_queue_depth[allocator]) and a current max queue depth * (mga_cur_max_alloc_queue_depth[allocator]), and each metaslab group * has an absolute max queue depth (mg_max_alloc_queue_depth). We * add IOs to an allocator until the mg_alloc_queue_depth for that * allocator hits the cur_max. Every time an IO completes for a given * allocator on a given metaslab group, we increment its cur_max until * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to * help protect against disks that decrease in performance over time. * * It's possible for an allocator to handle more allocations than * its max. This can occur when gang blocks are required or when other * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out * of space then its share of work must be distributed to other * groups. */ boolean_t mg_no_free_space; uint64_t mg_allocations; uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; int mg_ms_disabled; boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; int mg_allocators; metaslab_group_allocator_t mg_allocator[]; }; /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. * This is the equivalent of highbit(UINT64_MAX). */ #define MAX_LBAS 64 /* * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context * where it is safe to update the on-disk space maps. An additional set * of in-core trees is maintained to track deferred frees * (ms_defer). Once a block is freed it will move from the * ms_freed to the ms_defer tree. A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transactions groups later. For example, a block that is freed in txg * 50 will not be available for reallocation until txg 52 (50 + * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. * A pool could be safely rolled back TXG_DEFERS_SIZE transactions * groups and ensure that no block has been reallocated. * * The simplified transition diagram looks like this: * * * ALLOCATE * | * V * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) * ^ * | ms_freeing <--- FREE * | | * | v * | ms_freed * | | * +-------- ms_defer[2] <-------+-------> (write to space map) * * * Each metaslab's space is tracked in a single space map in the MOS, * which is only updated in syncing context. Each time we sync a txg, * we append the allocs and frees from that txg to the space map. The * pool space is only updated once all metaslabs have finished syncing. * * To load the in-core free tree we read the space map from disk. This * object contains a series of alloc and free records that are combined * to make up the list of all free segments in this metaslab. These * segments are represented in-core by the ms_allocatable and are stored * in an AVL tree. * * As the space map grows (as a result of the appends) it will * eventually become space-inefficient. When the metaslab's in-core * free tree is zfs_condense_pct/100 times the size of the minimal * on-disk representation, we rewrite it in its minimized form. If a * metaslab needs to condense then we must set the ms_condensing flag to * ensure that allocations are not performed on the metaslab that is * being written. */ struct metaslab { /* * This is the main lock of the metaslab and its purpose is to * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram * aggregation, or marking the metaslab for initialization. */ kmutex_t ms_lock; /* * Acquired together with the ms_lock whenever we expect to * write to metaslab data on-disk (i.e flushing entries to * the metaslab's space map). It helps coordinate readers of * the metaslab's space map [see spa_vdev_remove_thread()] * with writers [see metaslab_sync() or metaslab_flush()]. * * Note that metaslab_load(), even though a reader, uses * a completely different mechanism to deal with the reading * of the metaslab's space map based on ms_synced_length. That * said, the function still uses the ms_sync_lock after it * has read the ms_sm [see relevant comment in metaslab_load() * as to why]. */ kmutex_t ms_sync_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; uint64_t ms_fragmentation; range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; uint64_t ms_allocating_total; /* * The following range trees are accessed only from syncing context. * ms_free*tree only have entries while syncing, and are empty * between syncs. */ range_tree_t *ms_freeing; /* to free this syncing txg */ range_tree_t *ms_freed; /* already freed this syncing txg */ range_tree_t *ms_defer[TXG_DEFER_SIZE]; range_tree_t *ms_checkpointing; /* to add to the checkpoint */ /* * The ms_trim tree is the set of allocatable segments which are * eligible for trimming. (When the metaslab is loaded, it's a * subset of ms_allocatable.) It's kept in-core as long as the * autotrim property is set and is not vacated when the metaslab * is unloaded. Its purpose is to aggregate freed ranges to * facilitate efficient trimming. */ range_tree_t *ms_trim; boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; /* * The number of consumers which have disabled the metaslab. */ uint64_t ms_disabled; /* * We must always hold the ms_lock when modifying ms_loaded * and ms_loading. */ boolean_t ms_loaded; boolean_t ms_loading; kcondvar_t ms_flush_cv; boolean_t ms_flushing; /* * The following histograms count entries that are in the * metaslab's space map (and its histogram) but are not in * ms_allocatable yet, because they are in ms_freed, ms_freeing, * or ms_defer[]. * * When the metaslab is not loaded, its ms_weight needs to * reflect what is allocatable (i.e. what will be part of * ms_allocatable if it is loaded). The weight is computed from * the spacemap histogram, but that includes ranges that are * not yet allocatable (because they are in ms_freed, * ms_freeing, or ms_defer[]). Therefore, when calculating the * weight, we need to remove those ranges. * * The ranges in the ms_freed and ms_defer[] range trees are all * present in the spacemap. However, the spacemap may have * multiple entries to represent a contiguous range, because it * is written across multiple sync passes, but the changes of * all sync passes are consolidated into the range trees. * Adjacent ranges that are freed in different sync passes of * one txg will be represented separately (as 2 or more entries) * in the space map (and its histogram), but these adjacent * ranges will be consolidated (represented as one entry) in the * ms_freed/ms_defer[] range trees (and their histograms). * * When calculating the weight, we can not simply subtract the * range trees' histograms from the spacemap's histogram, * because the range trees' histograms may have entries in * higher buckets than the spacemap, due to consolidation. * Instead we must subtract the exact entries that were added to * the spacemap's histogram. ms_synchist and ms_deferhist[] * represent these exact entries, so we can subtract them from * the spacemap's histogram when calculating ms_weight. * * ms_synchist represents the same ranges as ms_freeing + * ms_freed, but without consolidation across sync passes. * * ms_deferhist[i] represents the same ranges as ms_defer[i], * but without consolidation across sync passes. */ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; /* * Tracks the exact amount of allocated space of this metaslab * (and specifically the metaslab's space map) up to the most * recently completed sync pass [see usage in metaslab_sync()]. */ uint64_t ms_allocated_space; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ uint64_t ms_activation_weight; /* activation weight */ /* * Track of whenever a metaslab is selected for loading or allocation. * We use this value to determine how long the metaslab should * stay cached. */ uint64_t ms_selected_txg; /* * ms_load/unload_time can be used for performance monitoring * (e.g. by dtrace or mdb). */ hrtime_t ms_load_time; /* time last loaded */ hrtime_t ms_unload_time; /* time last unloaded */ hrtime_t ms_selected_time; /* time last allocated from */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* * -1 if it's not active in an allocator, otherwise set to the allocator * this metaslab is active for. */ int ms_allocator; boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ /* * The metaslab block allocators can optionally use a size-ordered * range tree and/or an array of LBAs. Not all allocators use * this functionality. The ms_allocatable_by_size should always * contain the same number of segments as the ms_allocatable. The * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. */ zfs_btree_t ms_allocatable_by_size; zfs_btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ /* * Node in metaslab class's selected txg list */ multilist_node_t ms_class_txg_node; /* * Allocs and frees that are committed to the vdev log spacemap but * not yet to this metaslab's spacemap. */ range_tree_t *ms_unflushed_allocs; range_tree_t *ms_unflushed_frees; /* * We have flushed entries up to but not including this TXG. In * other words, all changes from this TXG and onward should not * be in this metaslab's space map and must be read from the * log space maps. */ uint64_t ms_unflushed_txg; boolean_t ms_unflushed_dirty; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; boolean_t ms_new; }; typedef struct metaslab_unflushed_phys { /* on-disk counterpart of ms_unflushed_txg */ uint64_t msp_unflushed_txg; } metaslab_unflushed_phys_t; #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 1a04bedc3137..094258d47a48 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -1,478 +1,478 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. */ #ifndef _SYS_SPA_IMPL_H #define _SYS_SPA_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct spa_alloc { kmutex_t spaa_lock; avl_tree_t spaa_tree; } ____cacheline_aligned spa_alloc_t; typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; avl_node_t se_avl; zbookmark_err_phys_t se_zep; /* not accounted in avl_find */ } spa_error_entry_t; typedef struct spa_history_phys { uint64_t sh_pool_create_len; /* ending offset of zpool create */ uint64_t sh_phys_max_off; /* physical EOF */ uint64_t sh_bof; /* logical BOF */ uint64_t sh_eof; /* logical EOF */ uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; /* * All members must be uint64_t, for byteswap purposes. */ typedef struct spa_removing_phys { uint64_t sr_state; /* dsl_scan_state_t */ /* * The vdev ID that we most recently attempted to remove, * or -1 if no removal has been attempted. */ uint64_t sr_removing_vdev; /* * The vdev ID that we most recently successfully removed, * or -1 if no devices have been removed. */ uint64_t sr_prev_indirect_vdev; uint64_t sr_start_time; uint64_t sr_end_time; /* * Note that we can not use the space map's or indirect mapping's * accounting as a substitute for these values, because we need to * count frees of not-yet-copied data as though it did the copy. * Otherwise, we could get into a situation where copied > to_copy, * or we complete before copied == to_copy. */ uint64_t sr_to_copy; /* bytes that need to be copied */ uint64_t sr_copied; /* bytes that have been copied or freed */ } spa_removing_phys_t; /* * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense * of an indirect vdev's mapping object is in progress. */ typedef struct spa_condensing_indirect_phys { /* * The vdev ID of the indirect vdev whose indirect mapping is * being condensed. */ uint64_t scip_vdev; /* * The vdev's old obsolete spacemap. This spacemap's contents are * being integrated into the new mapping. */ uint64_t scip_prev_obsolete_sm_object; /* * The new mapping object that is being created. */ uint64_t scip_next_mapping_object; } spa_condensing_indirect_phys_t; struct spa_aux_vdev { uint64_t sav_object; /* MOS object for device list */ nvlist_t *sav_config; /* cached device config */ vdev_t **sav_vdevs; /* devices */ int sav_count; /* number devices */ boolean_t sav_sync; /* sync the device list */ nvlist_t **sav_pending; /* pending device additions */ uint_t sav_npending; /* # pending devices */ }; typedef struct spa_config_lock { kmutex_t scl_lock; kthread_t *scl_writer; int scl_write_wanted; int scl_count; kcondvar_t scl_cv; } ____cacheline_aligned spa_config_lock_t; typedef struct spa_config_dirent { list_node_t scd_link; char *scd_path; } spa_config_dirent_t; typedef enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES } zio_taskq_type_t; /* * State machine for the zpool-poolname process. The states transitions * are done as follows: * * From To Routine * PROC_NONE -> PROC_CREATED spa_activate() * PROC_CREATED -> PROC_ACTIVE spa_thread() * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() * PROC_DEACTIVATE -> PROC_GONE spa_thread() * PROC_GONE -> PROC_NONE spa_deactivate() */ typedef enum spa_proc_state { SPA_PROC_NONE, /* spa_proc = &p0, no process created */ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ } spa_proc_state_t; typedef struct spa_taskqs { uint_t stqs_count; taskq_t **stqs_taskq; } spa_taskqs_t; typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ AVZ_ACTION_INITIALIZE } spa_avz_action_t; typedef enum spa_config_source { SPA_CONFIG_SRC_NONE = 0, SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ } spa_config_source_t; struct spa { /* * Fields protected by spa_namespace_lock. */ char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */ char *spa_comment; /* comment */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ nvlist_t *spa_config_splitting; /* config for splitting */ nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ uint32_t spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ boolean_t spa_trust_config; /* do we trust vdev tree? */ boolean_t spa_is_splitting; /* in the middle of a split? */ spa_config_source_t spa_config_source; /* where config comes from? */ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ uint64_t spa_load_max_txg; /* best initial ub_txg */ uint64_t spa_claim_max_txg; /* highest claimed birth txg */ inode_timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ list_t spa_evicting_os_list; /* Objsets being evicted. */ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ /* * spa_allocs is an array, whose lengths is stored in spa_alloc_count. * There is one tree and one lock for each allocator, to help improve * allocation performance in write-heavy workloads. */ spa_alloc_t *spa_allocs; int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ /* checksum context templates */ kmutex_t spa_cksum_tmpls_lock; void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ /* in-flight verification bytes */ uint64_t spa_load_verify_bytes; kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ uint64_t spa_scan_pass_start; /* start time per pass/reboot */ uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ /* error scrub pause time in milliseconds */ uint64_t spa_scan_pass_errorscrub_pause; /* total error scrub paused time in milliseconds */ uint64_t spa_scan_pass_errorscrub_spent_paused; /* * We are in the middle of a resilver, and another resilver * is needed once this one completes. This is set iff any * vdev_resilver_deferred is set. */ boolean_t spa_resilver_deferred; kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ uint64_t spa_nonallocating_dspace; spa_removing_phys_t spa_removing_phys; spa_vdev_removal_t *spa_vdev_removal; spa_condensing_indirect_phys_t spa_condensing_indirect_phys; spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. */ uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; space_map_t *spa_syncing_log_sm; /* current log space map */ avl_tree_t spa_sm_logs_by_txg; kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ avl_tree_t spa_metaslabs_by_flushed; spa_unflushed_stats_t spa_unflushed_stats; list_t spa_log_summary; uint64_t spa_log_flushall_txg; zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ uint64_t spa_livelists_to_delete; /* set of livelists to free */ livelist_condense_entry_t spa_to_condense; /* next to condense */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ uint64_t spa_load_txg; /* ub txg that loaded */ uint64_t spa_load_txg_ts; /* timestamp from that ub */ uint64_t spa_load_meta_errors; /* verify metadata err count */ uint64_t spa_load_data_errors; /* verify data err count */ uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ avl_tree_t spa_errlist_healed; /* list of healed blocks */ uint64_t spa_deflate; /* should we deflate? */ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_deadman_failmode; /* failure mode for deadman */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ /* per-CPU array of root of async I/O: */ zio_t **spa_async_zio_root; zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ zio_suspend_reason_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ spa_mode_t spa_mode; /* SPA_MODE_{READ|WRITE} */ boolean_t spa_read_spacemaps; /* spacemaps available if ro */ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ uint64_t spa_ddt_stat_object; /* DDT statistics */ uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ spa_proc_state_t spa_proc_state; /* see definition */ proc_t *spa_proc; /* "zpool-poolname" process */ uintptr_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ uint64_t spa_prev_software_version; /* See ub_software_version */ uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ nvlist_t *spa_feat_stats; /* Cache of enabled features */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; taskqid_t spa_deadman_tqid; /* Task id */ uint64_t spa_deadman_calls; /* number of deadman calls */ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ uint64_t spa_deadman_synctime; /* deadman sync expiration */ uint64_t spa_deadman_ziotime; /* deadman zio expiration */ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ uint64_t spa_autotrim; /* automatic background trim? */ uint64_t spa_errata; /* errata issues detected */ spa_stats_t spa_stats; /* assorted spa statistics */ spa_keystore_t spa_keystore; /* loaded crypto keys */ /* arc_memory_throttle() parameters during low memory condition */ uint64_t spa_lowmem_page_load; /* memory load during txg */ uint64_t spa_lowmem_last_txg; /* txg window start */ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ + taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */ taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ + taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ uint64_t spa_leaf_list_gen; /* track leaf_list changes */ uint32_t spa_hostid; /* cached system hostid */ /* synchronization for threads in spa_wait */ kmutex_t spa_activities_lock; kcondvar_t spa_activities_cv; kcondvar_t spa_waiters_cv; int spa_waiters; /* number of waiting threads */ boolean_t spa_waiters_cancel; /* waiters should return */ char *spa_compatibility; /* compatibility file(s) */ /* * spa_refcount & spa_config_lock must be the last elements * because zfs_refcount_t changes size based on compilation options. * In order for the MDB module to function correctly, the other * fields must remain in the same location. */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ - - taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ }; extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name); extern void spa_event_post(sysevent_t *ev); extern int param_set_deadman_failmode_common(const char *val); extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); extern const char *zfs_active_allocator; extern int param_set_active_allocator_common(const char *val); #ifdef __cplusplus } #endif #endif /* _SYS_SPA_IMPL_H */ diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 3843419731b8..cfadd79d87f3 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -1,2587 +1,2598 @@ .\" .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" .Dd July 21, 2023 .Dt ZFS 4 .Os . .Sh NAME .Nm zfs .Nd tuning of the ZFS kernel module . .Sh DESCRIPTION The ZFS module supports these parameters: .Bl -tag -width Ds .It Sy dbuf_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_cache_shift Pq 1/32nd of the target ARC size. The behavior of the dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_metadata_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the metadata dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_metadata_cache_shift Pq 1/64th of the target ARC size. The behavior of the metadata dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_cache_hiwater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage over .Sy dbuf_cache_max_bytes when dbufs must be evicted directly. . .It Sy dbuf_cache_lowater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage below .Sy dbuf_cache_max_bytes when the evict thread stops evicting dbufs. . .It Sy dbuf_cache_shift Ns = Ns Sy 5 Pq uint Set the size of the dbuf cache .Pq Sy dbuf_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_metadata_cache_shift Ns = Ns Sy 6 Pq uint Set the size of the dbuf metadata cache .Pq Sy dbuf_metadata_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint Set the size of the mutex array for the dbuf cache. When set to .Sy 0 the array is dynamically sized based on total system memory. . .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. . .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Limit the amount we can prefetch with one call to this amount in bytes. This helps to limit the amount of memory that can be used by prefetching. . .It Sy ignore_hole_birth Pq int Alias for .Sy send_holes_without_birth_time . . .It Sy l2arc_feed_again Ns = Ns Sy 1 Ns | Ns 0 Pq int Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as fast as possible. . .It Sy l2arc_feed_min_ms Ns = Ns Sy 200 Pq u64 Min feed interval in milliseconds. Requires .Sy l2arc_feed_again Ns = Ns Ar 1 and only applicable in related situations. . .It Sy l2arc_feed_secs Ns = Ns Sy 1 Pq u64 Seconds between L2ARC writing. . .It Sy l2arc_headroom Ns = Ns Sy 2 Pq u64 How far through the ARC lists to search for L2ARC cacheable content, expressed as a multiplier of .Sy l2arc_write_max . ARC persistence across reboots can be achieved with persistent L2ARC by setting this parameter to .Sy 0 , allowing the full length of ARC lists to be searched for cacheable content. . .It Sy l2arc_headroom_boost Ns = Ns Sy 200 Ns % Pq u64 Scales .Sy l2arc_headroom by this percentage when L2ARC contents are being successfully compressed before writing. A value of .Sy 100 disables this feature. . .It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether buffers present on special vdevs are eligible for caching into L2ARC. If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. . .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large amounts of data that are not expected to be accessed more than once. .Pp The default is off, meaning both MRU and MFU data and metadata are cached. When turning off this feature, some MRU buffers will still be present in ARC and eventually cached on L2ARC. .No If Sy l2arc_noprefetch Ns = Ns Sy 0 , some prefetched buffers will be cached to L2ARC, and those might later transition to MRU, in which case the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp Regardless of .Sy l2arc_noprefetch , some MFU buffers might be evicted from ARC, accessed later on as prefetches and transition to MRU as prefetches. If accessed again they are counted as MRU and the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp The ARC status of L2ARC buffers when they were first cached in L2ARC can be seen in the .Sy l2arc_mru_asize , Sy l2arc_mfu_asize , No and Sy l2arc_prefetch_asize arcstats when importing the pool or onlining a cache device if persistent L2ARC is enabled. .Pp The .Sy evict_l2_eligible_mru arcstat does not take into account if this option is enabled as the information provided by the .Sy evict_l2_eligible_m[rf]u arcstats can be used to decide if toggling this option is appropriate for the current workload. . .It Sy l2arc_meta_percent Ns = Ns Sy 33 Ns % Pq uint Percent of ARC size allowed for L2ARC-only headers. Since L2ARC buffers are not evicted on memory pressure, too many headers on a system with an irrationally large L2ARC can render it slow or unusable. This parameter limits L2ARC writes and rebuilds to achieve the target. . .It Sy l2arc_trim_ahead Ns = Ns Sy 0 Ns % Pq u64 Trims ahead of the current write size .Pq Sy l2arc_write_max on L2ARC devices by this percentage of write size if we have filled the device. If set to .Sy 100 we TRIM twice the space required to accommodate upcoming writes. A minimum of .Sy 64 MiB will be trimmed. It also enables TRIM of the whole L2ARC device upon creation or addition to an existing pool or if the header of the device is invalid upon importing a pool or onlining a cache device. A value of .Sy 0 disables TRIM on L2ARC altogether and is the default as it can put significant stress on the underlying storage devices. This will vary depending of how well the specific device handles these commands. . .It Sy l2arc_noprefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int Do not write buffers to L2ARC if they were prefetched but not used by applications. In case there are prefetched buffers in L2ARC and this option is later set, we do not read the prefetched buffers from L2ARC. Unsetting this option is useful for caching sequential reads from the disks to L2ARC and serve those reads from L2ARC later on. This may be beneficial in case the L2ARC device is significantly faster in sequential reads than the disks of the pool. .Pp Use .Sy 1 to disable and .Sy 0 to enable caching/reading prefetches to/from L2ARC. . .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int No reads during writes. . .It Sy l2arc_write_boost Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq u64 Cold L2ARC devices will have .Sy l2arc_write_max increased by this amount while they remain cold. . .It Sy l2arc_write_max Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq u64 Max write bytes per interval. . .It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be disabled if there are problems importing a pool or attaching an L2ARC device (e.g. the L2ARC device is slow in reading stored log metadata, or the metadata has become somehow fragmented/unusable). . .It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Mininum size of an L2ARC device required in order to write log blocks in it. The log blocks are used upon importing the pool to rebuild the persistent L2ARC. .Pp For L2ARC devices less than 1 GiB, the amount of data .Fn l2arc_evict evicts is significant compared to the amount of restored L2ARC data. In this case, do not write log blocks in L2ARC in order not to waste space. . .It Sy metaslab_aliquot Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Metaslab granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. In normal operation, ZFS will try to write this amount of data to each disk before moving on to the next top-level vdev. . .It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group biasing based on their vdevs' over- or under-utilization relative to the pool. . .It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq u64 Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . .It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int Default dnode indirect block size as a power of 2. . .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 When attempting to log an output nvlist of an ioctl in the on-disk history, the output will not be stored if it is larger than this size (in bytes). This must be less than .Sy DMU_MAX_ACCESS Pq 64 MiB . This applies primarily to .Fn zfs_ioc_channel_program Pq cf. Xr zfs-program 8 . . .It Sy zfs_keep_log_spacemaps_at_export Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent log spacemaps from being destroyed during pool exports and destroys. . .It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable segment-based metaslab selection. . .It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int When using segment-based metaslab selection, continue allocating from the active metaslab until this option's worth of buckets have been exhausted. . .It Sy metaslab_debug_load Ns = Ns Sy 0 Ns | Ns 1 Pq int Load all metaslabs during pool import. . .It Sy metaslab_debug_unload Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent metaslabs from being unloaded. . .It Sy metaslab_fragmentation_factor_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable use of the fragmentation metric in computing metaslab weights. . .It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint Maximum distance to search forward from the last offset. Without this limit, fragmented pools can see .Em >100`000 iterations and .Fn metaslab_block_picker becomes the performance limiting factor on high-performance storage. .Pp With the default setting of .Sy 16 MiB , we typically see less than .Em 500 iterations, even with very fragmented .Sy ashift Ns = Ns Sy 9 pools. The maximum number of iterations possible is .Sy metaslab_df_max_search / 2^(ashift+1) . With the default setting of .Sy 16 MiB this is .Em 16*1024 Pq with Sy ashift Ns = Ns Sy 9 or .Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 . . .It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int If not searching forward (due to .Sy metaslab_df_max_search , metaslab_df_free_pct , .No or Sy metaslab_df_alloc_threshold ) , this tunable controls which segment is used. If set, we will use the largest free segment. If unset, we will use a segment of at least the requested size. . .It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1 hour Pc Pq u64 When we unload a metaslab, we cache the size of the largest free chunk. We use that cached size to determine whether or not to load a metaslab for a given allocation. As more frees accumulate in that metaslab while it's unloaded, the cached max size becomes less and less accurate. After a number of seconds controlled by this tunable, we stop considering the cached max size and start considering only the histogram instead. . .It Sy zfs_metaslab_mem_limit Ns = Ns Sy 25 Ns % Pq uint When we are loading a new metaslab, we check the amount of memory being used to store metaslab range trees. If it is over a threshold, we attempt to unload the least recently used metaslab to prevent the system from clogging all of its memory with range trees. This tunable sets the percentage of total system memory that is the threshold. . .It Sy zfs_metaslab_try_hard_before_gang Ns = Ns Sy 0 Ns | Ns 1 Pq int .Bl -item -compact .It If unset, we will first try normal allocation. .It If that fails then we will do a gang allocation. .It If that fails then we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El .Pp .Bl -item -compact .It If set, we will first try normal allocation. .It If that fails then we will do a "try hard" allocation. .It If that fails we will do a gang allocation. .It If that fails we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El . .It Sy zfs_metaslab_find_max_tries Ns = Ns Sy 100 Pq uint When not trying hard, we only consider this number of the best metaslabs. This improves performance, especially when there are many metaslabs per vdev and the allocation can't actually be satisfied (so we would otherwise iterate all metaslabs). . .It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint When a vdev is added, target this number of metaslabs per top-level vdev. . .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq uint Default lower limit for metaslab size. . .It Sy zfs_vdev_max_ms_shift Ns = Ns Sy 34 Po 16 GiB Pc Pq uint Default upper limit for metaslab size. . .It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq uint Maximum ashift used when optimizing for logical \[->] physical sector size on new top-level vdevs. May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint Minimum ashift used when creating new top-level vdevs. . .It Sy zfs_vdev_min_ms_count Ns = Ns Sy 16 Pq uint Minimum number of metaslabs to create in a top-level vdev. . .It Sy vdev_validate_skip Ns = Ns Sy 0 Ns | Ns 1 Pq int Skip label validation steps during pool import. Changing is not recommended unless you know what you're doing and are recovering a damaged label. . .It Sy zfs_vdev_ms_count_limit Ns = Ns Sy 131072 Po 128k Pc Pq uint Practical upper limit of total metaslabs per top-level vdev. . .It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group preloading. . +.It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint +Maximum number of metaslabs per group to preload +. +.It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint +Percentage of CPUs to run a metaslab preload taskq +. .It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth, as is typically the case on a modern constant angular velocity disk drive. . .It Sy metaslab_unload_delay Ns = Ns Sy 32 Pq uint After a metaslab is used, we keep it loaded for this many TXGs, to attempt to reduce unnecessary reloading. Note that both this many TXGs and .Sy metaslab_unload_delay_ms milliseconds must pass before unloading will occur. . .It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq uint After a metaslab is used, we keep it loaded for this many milliseconds, to attempt to reduce unnecessary reloading. Note, that both this many milliseconds and .Sy metaslab_unload_delay TXGs must pass before unloading will occur. . .It Sy reference_history Ns = Ns Sy 3 Pq uint Maximum reference holders being tracked when reference_tracking_enable is active. . .It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int Track reference holders to .Sy refcount_t objects (debug builds only). . .It Sy send_holes_without_birth_time Ns = Ns Sy 1 Ns | Ns 0 Pq int When set, the .Sy hole_birth optimization will not be used, and all holes will always be sent during a .Nm zfs Cm send . This is useful if you suspect your datasets are affected by a bug in .Sy hole_birth . . .It Sy spa_config_path Ns = Ns Pa /etc/zfs/zpool.cache Pq charp SPA config file. . .It Sy spa_asize_inflation Ns = Ns Sy 24 Pq uint Multiplication factor used to estimate actual disk consumption from the size of data being written. The default value is a worst case estimate, but lower values may be valid for a given pool depending on its configuration. Pool administrators who understand the factors involved may wish to specify a more realistic inflation factor, particularly if they operate close to quota or capacity limits. . .It Sy spa_load_print_vdev_tree Ns = Ns Sy 0 Ns | Ns 1 Pq int Whether to print the vdev tree in the debugging message buffer during pool import. . .It Sy spa_load_verify_data Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse data blocks during an "extreme rewind" .Pq Fl X import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal skips non-metadata blocks. It can be toggled once the import has started to stop or start the traversal of non-metadata blocks. . .It Sy spa_load_verify_metadata Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse blocks during an "extreme rewind" .Pq Fl X pool import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal is not performed. It can be toggled once the import has started to stop or start the traversal. . .It Sy spa_load_verify_shift Ns = Ns Sy 4 Po 1/16th Pc Pq uint Sets the maximum number of bytes to consume during pool import to the log2 fraction of the target ARC size. . .It Sy spa_slop_shift Ns = Ns Sy 5 Po 1/32nd Pc Pq int Normally, we don't allow the last .Sy 3.2% Pq Sy 1/2^spa_slop_shift of space in the pool to be consumed. This ensures that we don't run the pool completely out of space, due to unaccounted changes (e.g. to the MOS). It also limits the worst-case time to allocate space. If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the new format when enabling the .Sy head_errlog feature. The default is to convert all log entries. . .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space, in bytes, which will be included as "unnecessary" data in a chunk of copied data. .Pp The default value here was chosen to align with .Sy zfs_vdev_read_gap_limit , which is a similar concept when doing regular reads (but there's no reason it has to be the same). . .It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Logical ashift for file-based devices. . .It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Physical ashift for file-based devices. . .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, when we start iterating over a ZAP object, prefetch the entire object (all leaf blocks). However, this is limited by .Sy dmu_prefetch_max . . .It Sy zap_micro_max_size Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to this value, doubling on each hit. After that it may grow further by 1/8 per hit, but only if some prefetch since last time haven't completed in time to satisfy demand request, i.e. prefetch depth didn't cover the read latency or the pool got saturated. . .It Sy zfetch_max_distance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch per stream. . .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . .It Sy zfetch_min_sec_reap Ns = Ns Sy 1 Pq uint Min time before inactive prefetch stream can be reclaimed . .It Sy zfetch_max_sec_reap Ns = Ns Sy 2 Pq uint Max time before inactive prefetch stream can be deleted . .It Sy zfs_abd_scatter_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enables ARC from using scatter/gather lists and forces all allocations to be linear in kernel memory. Disabling can improve performance in some code paths at the expense of fragmented kernel memory. . .It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER\-1 Pq uint Maximum number of consecutive memory pages allocated in a single block for scatter/gather lists. .Pp The value of .Sy MAX_ORDER depends on kernel configuration. . .It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5 KiB Pc Pq uint This is the minimum allocation size that will use scatter (page-based) ABDs. Smaller allocations will use linear ABDs. . .It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq u64 When the number of bytes consumed by dnodes in the ARC exceeds this number of bytes, try to unpin some of it in response to demand for non-metadata. This value acts as a ceiling to the amount of dnode metadata, and defaults to .Sy 0 , which indicates that a percent which is based on .Sy zfs_arc_dnode_limit_percent of the ARC meta buffers that may be used for dnodes. .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage that can be consumed by dnodes of ARC meta buffers. .Pp See also .Sy zfs_arc_dnode_limit , which serves a similar purpose but has a higher priority if nonzero. . .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage of ARC dnodes to try to scan in response to demand for non-metadata when the number of bytes consumed by dnodes exceeds .Sy zfs_arc_dnode_limit . . .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint The ARC's buffer hash table is sized based on the assumption of an average block size of this value. This works out to roughly 1 MiB of hash table per 1 GiB of physical memory with 8-byte pointers. For configurations with a known larger average block size, this value can be increased to reduce the memory footprint. . .It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint When .Fn arc_is_overflowing , .Fn arc_get_data_impl waits for this percent of the requested amount of data to be evicted. For example, by default, for every .Em 2 KiB that's evicted, .Em 1 KiB of it may be "reused" by a new allocation. Since this is above .Sy 100 Ns % , it ensures that progress is made towards getting .Sy arc_size No under Sy arc_c . Since this is finite, it ensures that allocations can still happen, even during the potentially long time that .Sy arc_size No is more than Sy arc_c . . .It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq uint Number ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. . .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint If set to a non zero value, it will replace the .Sy arc_grow_retry value with this value. The .Sy arc_grow_retry .No value Pq default Sy 5 Ns s is the number of seconds the ARC will wait before trying to resume growth after a memory pressure event. . .It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int Throttle I/O when free system memory drops below this percentage of total system memory. Setting this value to .Sy 0 will disable the throttle. . .It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq u64 Max size of ARC in bytes. If .Sy 0 , then the max size of ARC is determined by the amount of system memory installed. Under Linux, half of system memory will be used as the limit. Under .Fx , the larger of .Sy all_system_memory No \- Sy 1 GiB and .Sy 5/8 No \(mu Sy all_system_memory will be used as the limit. This value must be at least .Sy 67108864 Ns B Pq 64 MiB . .Pp This value can be changed dynamically, with some caveats. It cannot be set back to .Sy 0 while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. . .It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint Balance between metadata and data on ghost hits. Values above 100 increase metadata caching by proportionally reducing effect of ghost data hits on target data/metadata rate. . .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 Min size of ARC in bytes. .No If set to Sy 0 , arc_c_min will default to consuming the larger of .Sy 32 MiB and .Sy all_system_memory No / Sy 32 . . .It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq uint Minimum time prefetched blocks are locked in the ARC. . .It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq uint Minimum time "prescient prefetched" blocks are locked in the ARC. These blocks are meant to be prefetched fairly aggressively ahead of the code that may use them. . .It Sy zfs_arc_prune_task_threads Ns = Ns Sy 1 Pq int Number of arc_prune threads. .Fx does not need more than one. Linux may theoretically use one per mount point up to number of CPUs, but that was not proven to be useful. . .It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int Number of missing top-level vdevs which will be allowed during pool import (only in read-only mode). . .It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64 Maximum size in bytes allowed to be passed as .Sy zc_nvlist_src_size for ioctls on .Pa /dev/zfs . This prevents a user from causing the kernel to allocate an excessive amount of memory. When the limit is exceeded, the ioctl fails with .Sy EINVAL and a description of the error is sent to the .Pa zfs-dbgmsg log. This parameter should not need to be touched under normal circumstances. If .Sy 0 , equivalent to a quarter of the user-wired memory limit under .Fx and to .Sy 134217728 Ns B Pq 128 MiB under Linux. . .It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq uint To allow more fine-grained locking, each ARC state contains a series of lists for both data and metadata objects. Locking is performed at the level of these "sub-lists". This parameters controls the number of sub-lists per ARC state, and also applies to other uses of the multilist data structure. .Pp If .Sy 0 , equivalent to the greater of the number of online CPUs and .Sy 4 . . .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int The ARC size is considered to be overflowing if it exceeds the current ARC target size .Pq Sy arc_c by thresholds determined by this parameter. Exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2 starts ARC reclamation process. If that appears insufficient, exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5 blocks new buffer allocation until the reclaim thread catches up. Started reclamation process continues till ARC size returns below the target size. .Pp The default value of .Sy 8 causes the ARC to start reclamation if it exceeds the target size by .Em 0.2% of the target size, and block allocations by .Em 0.6% . . .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint If nonzero, this will update .Sy arc_shrink_shift Pq default Sy 7 with the new value. . .It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint Percent of pagecache to reclaim ARC to. .Pp This tunable allows the ZFS ARC to play more nicely with the kernel's LRU pagecache. It can guarantee that the ARC size won't collapse under scanning pressure on the pagecache, yet still allows the ARC to be reclaimed down to .Sy zfs_arc_min if necessary. This value is specified as percent of pagecache size (as measured by .Sy NR_FILE_PAGES ) , where that percent may exceed .Sy 100 . This only operates during memory pressure/reclaim. . .It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. .Pp The default limit of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to less than 100 ms per allocation attempt, even with a small average compressed block size of ~8 KiB. .Pp The parameter can be set to 0 (zero) to disable the limit, and only applies on Linux. . .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64 The target number of bytes the ARC should leave as free memory on the system. If zero, equivalent to the bigger of .Sy 512 KiB No and Sy all_system_memory/64 . . .It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int Disable pool import at module load by ignoring the cache file .Pq Sy spa_config_path . . .It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint Rate limit checksum events to this many per second. Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) or else the daemon may not trigger any action. . .It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. The timeout is scaled based on a percentage of the last lwb latency to avoid significantly impacting the latency of each individual transaction record (itx). . .It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int Vdev indirection layer (used for device removal) sleeps for this many milliseconds during mapping generation. Intended for use with the test suite to throttle vdev removal speed. . .It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq uint Minimum percent of obsolete bytes in vdev mapping required to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . Intended for use with the test suite to facilitate triggering condensing as needed. . .It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable condensing indirect vdev mappings. When set, attempt to condense indirect vdev mappings if the mapping uses more than .Sy zfs_condense_min_mapping_bytes bytes of memory and if the obsolete space map object uses more than .Sy zfs_condense_max_obsolete_bytes bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. . .It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64 Minimum size vdev mapping to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Internally ZFS keeps a small log to facilitate debugging. The log is enabled by default, and can be disabled by unsetting this option. The contents of the log can be accessed by reading .Pa /proc/spl/kstat/zfs/dbgmsg . Writing .Sy 0 to the file clears the log. .Pp This setting does not influence debug prints due to .Sy zfs_flags . . .It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Maximum size of the internal ZFS debug log. . .It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int Historically used for controlling what reporting was available under .Pa /proc/spl/kstat/zfs . No effect. . .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int When a pool sync operation takes longer than .Sy zfs_deadman_synctime_ms , or when an individual I/O operation takes longer than .Sy zfs_deadman_ziotime_ms , then the operation is considered to be "hung". If .Sy zfs_deadman_enabled is set, then the deadman behavior is invoked as described by .Sy zfs_deadman_failmode . By default, the deadman is enabled and set to .Sy wait which results in "hung" I/O operations only being logged. The deadman is automatically disabled when a pool gets suspended. . .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp Controls the failure behavior when the deadman detects a "hung" I/O operation. Valid values are: .Bl -tag -compact -offset 4n -width "continue" .It Sy wait Wait for a "hung" operation to complete. For each "hung" operation a "deadman" event will be posted describing that operation. .It Sy continue Attempt to recover from a "hung" operation by re-dispatching it to the I/O pipeline if possible. .It Sy panic Panic the system. This can be used to facilitate automatic fail-over to a properly configured fail-over partner. .El . .It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 Check time in milliseconds. This defines the frequency at which we check for hung I/O requests and potentially invoke the .Sy zfs_deadman_failmode behavior. . .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the pool sync completes. . .It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and an individual I/O operation is considered to be "hung". As long as the operation remains "hung", the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the operation completes. . .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of .Sy zfs_dirty_data_max . This value should be at least .Sy zfs_vdev_async_write_active_max_dirty_percent . .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int This controls how quickly the transaction delay approaches infinity. Larger values cause longer delays for a given amount of dirty data. .Pp For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between ten times and a tenth of this number. .No See Sx ZFS TRANSACTION DELAY . .Pp .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . . .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. Intended for users whose pools were created with OpenZFS pre-release versions and now have compatibility issues. . .It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong Maximum number of uses of a single salt value before generating a new one for encrypted datasets. The default value is also the maximum. . .It Sy zfs_object_mutex_size Ns = Ns Sy 64 Pq uint Size of the znode hashtable used for holds. .Pp Due to the need to hold locks on objects that may not exist yet, kernel mutexes are not created per-object and instead a hashtable is used where collisions will result in objects waiting when there is not actually contention on the same object. . .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int Rate limit delay and deadman zevents (which report slow I/O operations) to this many per second. . .It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Upper-bound limit for unflushed metadata changes to be held by the log spacemap in memory, in bytes. . .It Sy zfs_unflushed_max_mem_ppm Ns = Ns Sy 1000 Ns ppm Po 0.1% Pc Pq u64 Part of overall system memory that ZFS allows to be used for unflushed metadata changes by the log spacemap, in millionths. . .It Sy zfs_unflushed_log_block_max Ns = Ns Sy 131072 Po 128k Pc Pq u64 Describes the maximum number of log spacemap blocks allowed for each pool. The default value means that the space in all the log spacemaps can add up to no more than .Sy 131072 blocks (which means .Em 16 GiB of logical space before compression and ditto blocks, assuming that blocksize is .Em 128 KiB ) . .Pp This tunable is important because it involves a trade-off between import time after an unclean export and the frequency of flushing metaslabs. The higher this number is, the more log blocks we allow when the pool is active which means that we flush metaslabs less often and thus decrease the number of I/O operations for spacemap updates per TXG. At the same time though, that means that in the event of an unclean export, there will be more log spacemap blocks for us to read, inducing overhead in the import time of the pool. The lower the number, the amount of flushing increases, destroying log blocks quicker as they become obsolete faster, which leaves less blocks to be read during import time after a crash. .Pp Each log spacemap block existing during pool import leads to approximately one extra logical I/O issued. This is the reason why this tunable is exposed in terms of blocks rather than space used. . .It Sy zfs_unflushed_log_block_min Ns = Ns Sy 1000 Pq u64 If the number of metaslabs is small and our incoming rate is high, we could get into a situation that we are flushing all our metaslabs every TXG. Thus we always allow at least this many log blocks. . .It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq u64 Tunable used to determine the number of blocks that can be used for the spacemap log, expressed as a percentage of the total number of unflushed metaslabs in the pool. . .It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq u64 Tunable limiting maximum time in TXGs any metaslab may remain unflushed. It effectively limits maximum number of unflushed per-TXG spacemap logs that need to be read after unclean pool export. . .It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint When enabled, files will not be asynchronously removed from the list of pending unlinks and the space they consume will be leaked. Once this option has been disabled and the dataset is remounted, the pending unlinks will be processed and the freed space returned to the pool. This option is used by the test suite. . .It Sy zfs_delete_blocks Ns = Ns Sy 20480 Pq ulong This is the used to define a large file for the purposes of deletion. Files containing more than .Sy zfs_delete_blocks will be deleted asynchronously, while smaller files are deleted synchronously. Decreasing this value will reduce the time spent in an .Xr unlink 2 system call, at the expense of a longer delay before the freed space is available. This only applies on Linux. . .It Sy zfs_dirty_data_max Ns = Pq int Determines the dirty space limit in bytes. Once this limit is exceeded, new writes are halted until space frees up. This parameter takes precedence over .Sy zfs_dirty_data_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy physical_ram/10 , capped at .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_max_max Ns = Pq int Maximum allowable value of .Sy zfs_dirty_data_max , expressed in bytes. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. This parameter takes precedence over .Sy zfs_dirty_data_max_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy min(physical_ram/4, 4GiB) , or .Sy min(physical_ram/4, 1GiB) for 32-bit systems. . .It Sy zfs_dirty_data_max_max_percent Ns = Ns Sy 25 Ns % Pq uint Maximum allowable value of .Sy zfs_dirty_data_max , expressed as a percentage of physical RAM. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. The parameter .Sy zfs_dirty_data_max_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_dirty_data_max_percent Ns = Ns Sy 10 Ns % Pq uint Determines the dirty space limit, expressed as a percentage of all memory. Once this limit is exceeded, new writes are halted until space frees up. The parameter .Sy zfs_dirty_data_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . .Pp Subject to .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_sync_percent Ns = Ns Sy 20 Ns % Pq uint Start syncing out a transaction group if there's at least this much dirty data .Pq as a percentage of Sy zfs_dirty_data_max . This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . .It Sy zfs_wrlog_data_max Ns = Pq int The upper limit of write-transaction zil log data size in bytes. Write operations are throttled when approaching the limit until log data is cleared out after transaction group sync. Because of some overhead, it should be set at least 2 times the size of .Sy zfs_dirty_data_max .No to prevent harming normal write throughput . It also should be smaller than the size of the slog device if slog is present. .Pp Defaults to .Sy zfs_dirty_data_max*2 . .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not run out of space. Instead, .Xr fallocate 2 space preallocation only checks that sufficient space is currently available in the pool or the user's project quota allocation, and then creates a sparse file of the requested size. The requested space is multiplied by .Sy zfs_fallocate_reserve_percent to allow additional space for indirect blocks and other internal metadata. Setting this to .Sy 0 disables support for .Xr fallocate 2 and causes it to return .Sy EOPNOTSUPP . . .It Sy zfs_fletcher_4_impl Ns = Ns Sy fastest Pq string Select a fletcher 4 implementation. .Pp Supported selectors are: .Sy fastest , scalar , sse2 , ssse3 , avx2 , avx512f , avx512bw , .No and Sy aarch64_neon . All except .Sy fastest No and Sy scalar require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of fletcher 4 are available, the .Sy fastest will be chosen using a micro benchmark. Selecting .Sy scalar results in the original CPU-based calculation being used. Selecting any option other than .Sy fastest No or Sy scalar results in vector instructions from the respective CPU instruction set being used. . .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp Supported selectors are: .Sy cycle , fastest , generic , sse2 , sse41 , avx2 , avx512 . All except .Sy cycle , fastest No and Sy generic require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of BLAKE3 are available, the .Sy fastest will be chosen using a micro benchmark. You can see the benchmark results by reading this kstat file: .Pa /proc/spl/kstat/zfs/chksum_bench . . .It Sy zfs_free_bpobj_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable the processing of the free_bpobj object. . .It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64 Maximum number of blocks freed in a single TXG. . .It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64 Maximum number of dedup blocks freed in a single TXG. . .It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint Maximum asynchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_read_min_active Ns = Ns Sy 1 Pq uint Minimum asynchronous read I/O operation active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_active_max_dirty_percent Ns = Ns Sy 60 Ns % Pq uint When the pool has more than this much dirty data, use .Sy zfs_vdev_async_write_max_active to limit active async writes. If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_active_min_dirty_percent Ns = Ns Sy 30 Ns % Pq uint When the pool has less than this much dirty data, use .Sy zfs_vdev_async_write_min_active to limit active async writes. If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_max_active Ns = Ns Sy 10 Pq uint Maximum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_min_active Ns = Ns Sy 2 Pq uint Minimum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . .Pp Lower values are associated with better latency on rotational media but poorer resilver performance. The default value of .Sy 2 was chosen as a compromise. A value of .Sy 3 has been shown to improve resilver performance further at a cost of further increasing latency. . .It Sy zfs_vdev_initializing_max_active Ns = Ns Sy 1 Pq uint Maximum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_initializing_min_active Ns = Ns Sy 1 Pq uint Minimum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_max_active Ns = Ns Sy 1000 Pq uint The maximum number of I/O operations active to each device. Ideally, this will be at least the sum of each queue's .Sy max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_open_timeout_ms Ns = Ns Sy 1000 Pq uint Timeout value to wait before determining a device is missing during import. This is helpful for transient missing paths due to links being briefly removed and recreated in response to udev events. . .It Sy zfs_vdev_rebuild_max_active Ns = Ns Sy 3 Pq uint Maximum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_rebuild_min_active Ns = Ns Sy 1 Pq uint Minimum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_max_active Ns = Ns Sy 2 Pq uint Maximum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_min_active Ns = Ns Sy 1 Pq uint Minimum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_max_active Ns = Ns Sy 2 Pq uint Maximum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_min_active Ns = Ns Sy 1 Pq uint Minimum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_trim_max_active Ns = Ns Sy 2 Pq uint Maximum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_trim_min_active Ns = Ns Sy 1 Pq uint Minimum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_nia_delay Ns = Ns Sy 5 Pq uint For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), the number of concurrently-active I/O operations is limited to .Sy zfs_*_min_active , unless the vdev is "idle". When there are no interactive I/O operations active (synchronous or otherwise), and .Sy zfs_vdev_nia_delay operations have completed since the last interactive operation, then the vdev is considered to be "idle", and the number of concurrently-active non-interactive operations is increased to .Sy zfs_*_max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_nia_credit Ns = Ns Sy 5 Pq uint Some HDDs tend to prioritize sequential I/O so strongly, that concurrent random I/O latency reaches several seconds. On some HDDs this happens even if sequential I/O operations are submitted one at a time, and so setting .Sy zfs_*_max_active Ns = Sy 1 does not help. To prevent non-interactive I/O, like scrub, from monopolizing the device, no more than .Sy zfs_vdev_nia_credit operations can be sent while there are outstanding incomplete interactive operations. This enforced wait ensures the HDD services the interactive I/O within a reasonable amount of time. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_queue_depth_pct Ns = Ns Sy 1000 Ns % Pq uint Maximum number of queued allocations per top-level vdev expressed as a percentage of .Sy zfs_vdev_async_write_max_active , which allows the system to detect devices that are more capable of handling allocations and to allocate more blocks to those devices. This allows for dynamic allocation distribution when devices are imbalanced, as fuller devices will tend to be slower than empty devices. .Pp Also see .Sy zio_dva_throttle_enabled . . .It Sy zfs_vdev_def_queue_depth Ns = Ns Sy 32 Pq uint Default queue depth for each vdev IO allocator. Higher values allow for better coalescing of sequential writes before sending them to the disk, but can increase transaction commit times. . .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint Defines if the driver should retire on a given error type. The following options may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 Device No driver retries on device errors 2 Transport No driver retries on transport errors. 4 Driver No driver retries on driver errors. .TE . .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . . .It Sy zfs_admin_snapshot Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow the creation, removal, or renaming of entries in the .Sy .zfs/snapshot directory to cause the creation, destruction, or renaming of snapshots. When enabled, this functionality works both locally and over NFS exports which have the .Em no_root_squash option set. . .It Sy zfs_flags Ns = Ns Sy 0 Pq int Set additional debugging flags. The following flags may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 ZFS_DEBUG_DPRINTF Enable dprintf entries in the debug log. * 2 ZFS_DEBUG_DBUF_VERIFY Enable extra dbuf verifications. * 4 ZFS_DEBUG_DNODE_VERIFY Enable extra dnode verifications. 8 ZFS_DEBUG_SNAPNAMES Enable snapshot name verification. * 16 ZFS_DEBUG_MODIFY Check for illegally modified ARC buffers. 64 ZFS_DEBUG_ZIO_FREE Enable verification of block frees. 128 ZFS_DEBUG_HISTOGRAM_VERIFY Enable extra spacemap histogram verifications. 256 ZFS_DEBUG_METASLAB_VERIFY Verify space accounting on disk matches in-memory \fBrange_trees\fP. 512 ZFS_DEBUG_SET_ERROR Enable \fBSET_ERROR\fP and dprintf entries in the debug log. 1024 ZFS_DEBUG_INDIRECT_REMAP Verify split blocks created by device removal. 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. 4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing. .TE .Sy \& * No Requires debug build . . .It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint Enables btree verification. The following settings are culminative: .TS box; lbz r l l . Value Description 1 Verify height. 2 Verify pointers from children to parent. 3 Verify element counts. 4 Verify element order. (expensive) * 5 Verify unused memory is poisoned. (expensive) .TE .Sy \& * No Requires debug build . . .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO while reading metadata (e.g. indirect blocks), space referenced by the missing metadata can not be freed. Normally this causes the background destroy to become "stalled", as it is unable to make forward progress. While in this stalled state, all remaining space to free from the error-encountering filesystem is "temporarily leaked". Set this flag to cause it to ignore the .Sy EIO , permanently leak the space from indirect blocks that can not be read, and continue to free everything else that it can. .Pp The default "stalling" behavior is useful if the storage partially fails (i.e. some but not all I/O operations fail), and then later recovers. In this case, we will be able to continue pool operations while it is partially failed, and when it recovers, we can continue to free the space, with no leaks. Note, however, that this case is actually fairly rare. .Pp Typically pools either .Bl -enum -compact -offset 4n -width "1." .It fail completely (but perhaps temporarily, e.g. due to a top-level vdev going offline), or .It have localized, permanent errors (e.g. disk returns the wrong data due to bit flip or firmware bug). .El In the former case, this setting does not matter because the pool will be suspended and the sync thread will not be able to make forward progress regardless. In the latter, because the error is permanent, the best we can do is leak the minimum amount of space, which is what setting this flag will do. It is therefore reasonable for this flag to normally be set, but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. . .It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint During a .Nm zfs Cm destroy operation using the .Sy async_destroy feature, a minimum of this much time will be spent working on freeing blocks per TXG. . .It Sy zfs_obsolete_min_time_ms Ns = Ns Sy 500 Ns ms Pq uint Similar to .Sy zfs_free_min_time_ms , but for cleanup of old indirection records for removed vdevs. . .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64 Largest data block to write to the ZIL. Larger blocks will be treated as if the dataset being written to had the .Sy logbias Ns = Ns Sy throughput property set. . .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64 Pattern written to vdev free space by .Xr zpool-initialize 8 . . .It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Size of writes used by .Xr zpool-initialize 8 . This option is used by the test suite. . .It Sy zfs_livelist_max_entries Ns = Ns Sy 500000 Po 5*10^5 Pc Pq u64 The threshold size (in block pointers) at which we create a new sub-livelist. Larger sublists are more costly from a memory perspective but the fewer sublists there are, the lower the cost of insertion. . .It Sy zfs_livelist_min_percent_shared Ns = Ns Sy 75 Ns % Pq int If the amount of shared space between a snapshot and its clone drops below this threshold, the clone turns off the livelist and reverts to the old deletion method. This is in place because livelists no long give us a benefit once a clone has been overwritten enough. . .It Sy zfs_livelist_condense_new_alloc Ns = Ns Sy 0 Pq int Incremented each time an extra ALLOC blkptr is added to a livelist entry while it is being condensed. This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_sync . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the synctask \(em .Fn spa_livelist_condense_sync . This option is used by the test suite to trigger race conditions. . .It Sy zfs_livelist_condense_zthr_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_cb . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_zthr_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the open context condensing work in .Fn spa_livelist_condense_cb . This option is used by the test suite to trigger race conditions. . .It Sy zfs_lua_max_instrlimit Ns = Ns Sy 100000000 Po 10^8 Pc Pq u64 The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. . .It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100 MiB Pc Pq u64 The maximum memory limit that can be set for a ZFS channel program, specified in bytes. . .It Sy zfs_max_dataset_nesting Ns = Ns Sy 50 Pq int The maximum depth of nested datasets. This value can be tuned temporarily to fix existing datasets that exceed the predefined limit. . .It Sy zfs_max_log_walking Ns = Ns Sy 5 Pq u64 The number of past TXGs that the flushing algorithm of the log spacemap feature uses to estimate incoming log blocks. . .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq u64 Maximum number of rows allowed in the summary of the spacemap log. . .It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16 MiB Pc Pq uint We currently support block sizes from .Em 512 Po 512 B Pc No to Em 16777216 Po 16 MiB Pc . The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on I/O latency, and also potentially on the memory allocator. Therefore, we formerly forbade creating blocks larger than 1M. Larger blocks could be created by changing it, and pools with larger blocks can always be imported and used, regardless of this setting. . .It Sy zfs_allow_redacted_dataset_mount Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow datasets received with redacted send/receive to be mounted. Normally disabled because these datasets may be missing key data. . .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . .It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. An active metaslab that exceeds this threshold will no longer keep its active status allowing better metaslabs to be selected. . .It Sy zfs_mg_fragmentation_threshold Ns = Ns Sy 95 Ns % Pq uint Metaslab groups are considered eligible for allocations if their fragmentation metric (measured as a percentage) is less than or equal to this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold. . .It Sy zfs_mg_noalloc_threshold Ns = Ns Sy 0 Ns % Pq uint Defines a threshold at which metaslab groups should be eligible for allocations. The value is expressed as a percentage of free space beyond which a metaslab group is always eligible for allocations. If a metaslab group's free space is less than or equal to the threshold, the allocator will avoid allocating to that group unless all groups in the pool have reached the threshold. Once all groups have reached the threshold, all groups are allowed to accept allocations. The default value of .Sy 0 disables the feature and causes all metaslab groups to be eligible for allocations. .Pp This parameter allows one to deal with pools having heavily imbalanced vdevs such as would be the case when a new vdev has been added. Setting the threshold to a non-zero percentage will stop allocations from being made to vdevs that aren't filled to the specified percentage and allow lesser filled vdevs to acquire more allocations than they otherwise would under the old .Sy zfs_mg_alloc_failures facility. . .It Sy zfs_ddt_data_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place DDT data into the special allocation class. . .It Sy zfs_user_indirect_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place user data indirect blocks into the special allocation class. . .It Sy zfs_multihost_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest multihost updates will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /multihost . . .It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq u64 Used to control the frequency of multihost writes which are performed when the .Sy multihost pool property is on. This is one of the factors used to determine the length of the activity check during import. .Pp The multihost write period is .Sy zfs_multihost_interval No / Sy leaf-vdevs . On average a multihost write will be issued for each leaf vdev every .Sy zfs_multihost_interval milliseconds. In practice, the observed period can vary with the I/O load and this observed value is the delay which is stored in the uberblock. . .It Sy zfs_multihost_import_intervals Ns = Ns Sy 20 Pq uint Used to control the duration of the activity test on import. Smaller values of .Sy zfs_multihost_import_intervals will reduce the import time but increase the risk of failing to detect an active pool. The total activity check time is never allowed to drop below one second. .Pp On import the activity check waits a minimum amount of time determined by .Sy zfs_multihost_interval No \(mu Sy zfs_multihost_import_intervals , or the same product computed on the host which last had the pool imported, whichever is greater. The activity check time may be further extended if the value of MMP delay found in the best uberblock indicates actual multihost updates happened at longer intervals than .Sy zfs_multihost_interval . A minimum of .Em 100 ms is enforced. .Pp .Sy 0 No is equivalent to Sy 1 . . .It Sy zfs_multihost_fail_intervals Ns = Ns Sy 10 Pq uint Controls the behavior of the pool when multihost write failures or delays are detected. .Pp When .Sy 0 , multihost write failures or delays are ignored. The failures will still be reported to the ZED which depending on its configuration may take action such as suspending the pool or offlining a device. .Pp Otherwise, the pool will be suspended if .Sy zfs_multihost_fail_intervals No \(mu Sy zfs_multihost_interval milliseconds pass without a successful MMP write. This guarantees the activity test will see MMP writes if the pool is imported. .Sy 1 No is equivalent to Sy 2 ; this is necessary to prevent the pool from being suspended due to normal, small I/O latency variations. . .It Sy zfs_no_scrub_io Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable scrub I/O. This results in scrubs not actually scrubbing data and simply doing a metadata crawl of the pool instead. . .It Sy zfs_no_scrub_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable block prefetching for scrubs. . .It Sy zfs_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable cache flush operations on disks when writing. Setting this will cause pool corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zfs_nopwrite_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Allow no-operation writes. The occurrence of nopwrites will further depend on other pool properties .Pq i.a. the checksumming and compression algorithms . . .It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable forcing TXG sync to find holes. When enabled forces ZFS to sync data when .Sy SEEK_HOLE No or Sy SEEK_DATA flags are used allowing holes in a file to be accurately reported. When disabled holes will not be reported in recently dirtied files. . .It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50 MiB Pc Pq int The number of bytes which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_traverse_indirect_prefetch_limit Ns = Ns Sy 32 Pq uint The number of blocks pointed by indirect (non-L0) block which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_per_txg_dirty_frees_percent Ns = Ns Sy 30 Ns % Pq u64 Control percentage of dirtied indirect blocks from frees allowed into one TXG. After this threshold is crossed, additional frees will wait until the next TXG. .Sy 0 No disables this throttle . . .It Sy zfs_prefetch_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable predictive prefetch. Note that it leaves "prescient" prefetch .Pq for, e.g., Nm zfs Cm send intact. Unlike predictive prefetch, prescient prefetch never issues I/O that ends up not being needed, so it can't hurt performance. . .It Sy zfs_qat_checksum_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for SHA256 checksums. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_compress_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for gzip compression. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_encrypt_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for AES-GCM encryption. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Bytes to read per chunk. . .It Sy zfs_read_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest reads will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /reads . . .It Sy zfs_read_history_hits Ns = Ns Sy 0 Ns | Ns 1 Pq int Include cache hits in read history . .It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Maximum read segment size to issue when sequentially resilvering a top-level vdev. . .It Sy zfs_rebuild_scrub_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Automatically start a pool scrub when the last active sequential resilver completes in order to verify the checksums of all blocks which have been resilvered. This is enabled by default and strongly recommended. . .It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Maximum amount of I/O that can be concurrently issued for a sequential resilver per leaf device, given in bytes. . .It Sy zfs_reconstruct_indirect_combinations_max Ns = Ns Sy 4096 Pq int If an indirect split block contains more than this many possible unique combinations when being reconstructed, consider it too computationally expensive to check them all. Instead, try at most this many randomly selected combinations each time the block is accessed. This allows all segment copies to participate fairly in the reconstruction when all combinations cannot be checked and prevents repeated use of one bad copy. . .It Sy zfs_recover Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to attempt to recover from fatal errors. This should only be used as a last resort, as it typically results in leaked space, or worse. . .It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore hard I/O errors during device removal. When set, if a device encounters a hard I/O error during the removal process the removal will not be cancelled. This can result in a normally recoverable block becoming permanently damaged and is hence not recommended. This should only be used as a last resort when the pool cannot be returned to a healthy state prior to removing the device. . .It Sy zfs_removal_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint This is used by the test suite so that it can ensure that certain actions happen while in the middle of a removal. . .It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The largest contiguous segment that we will attempt to allocate when removing a device. If there is a performance problem with attempting to allocate large blocks, consider decreasing this. The default value is also the maximum. . .It Sy zfs_resilver_disable_defer Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore the .Sy resilver_defer feature, causing an operation that would start a resilver to immediately restart the one in progress. . .It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3 s Pc Pq uint Resilvers are processed by the sync thread. While resilvering, it will spend at least this much time working on a resilver between TXG flushes. . .It Sy zfs_scan_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int If set, remove the DTL (dirty time list) upon completion of a pool scan (scrub), even if there were unrepairable errors. Intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. . .It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq uint Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time working on a scrub between TXG flushes. . .It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint Error blocks to be scrubbed in one txg. . .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint To preserve progress across reboots, the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/O to disk. The frequency of this flushing is determined by this tunable. . .It Sy zfs_scan_fill_weight Ns = Ns Sy 3 Pq uint This tunable affects how scrub and resilver I/O segments are ordered. A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. Changing the value afterwards will have no effect on scrub or resilver performance. . .It Sy zfs_scan_issue_strategy Ns = Ns Sy 0 Pq uint Determines the order that data will be verified while scrubbing or resilvering: .Bl -tag -compact -offset 4n -width "a" .It Sy 1 Data will be verified as sequentially as possible, given the amount of memory reserved for scrubbing .Pq see Sy zfs_scan_mem_lim_fact . This may improve scrub performance if the pool's data is very fragmented. .It Sy 2 The largest mostly-contiguous chunk of found data will be verified first. By deferring scrubbing of small segments, we may later find adjacent data to coalesce and increase the segment size. .It Sy 0 .No Use strategy Sy 1 No during normal verification .No and strategy Sy 2 No while taking a checkpoint . .El . .It Sy zfs_scan_legacy Ns = Ns Sy 0 Ns | Ns 1 Pq int If unset, indicates that scrubs and resilvers will gather metadata in memory before issuing sequential I/O. Otherwise indicates that the legacy algorithm will be used, where I/O is initiated as soon as it is discovered. Unsetting will not affect scrubs or resilvers that are already in progress. . .It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq int Sets the largest gap in bytes between scrub/resilver I/O operations that will still be considered sequential for sorting purposes. Changing this value will not affect scrubs or resilvers that are already in progress. . .It Sy zfs_scan_mem_lim_fact Ns = Ns Sy 20 Ns ^-1 Pq uint Maximum fraction of RAM used for I/O sorting by sequential scan algorithm. This tunable determines the hard limit for I/O sorting memory usage. When the hard limit is reached we stop scanning metadata and start issuing data verification I/O. This is done until we get below the soft limit. . .It Sy zfs_scan_mem_lim_soft_fact Ns = Ns Sy 20 Ns ^-1 Pq uint The fraction of the hard limit used to determined the soft limit for I/O sorting by the sequential scan algorithm. When we cross this limit from below no action is taken. When we cross this limit from above it is because we are issuing verification I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. . .It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint When reporting resilver throughput and estimated completion time use the performance observed over roughly the last .Sy zfs_scan_report_txgs TXGs. When set to zero performance is calculated over the time between checkpoints. . .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int Enforce tight memory limits on pool scans when a sequential scan is in progress. When disabled, the memory limit may be exceeded by fast disks. . .It Sy zfs_scan_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq int Freezes a scrub/resilver in progress without actually pausing it. Intended for testing/debugging. . .It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. . .It Sy zfs_send_corrupt_data Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow sending of corrupt data (ignore read/checksum errors when sending). . .It Sy zfs_send_unmodified_spill_blocks Ns = Ns Sy 1 Ns | Ns 0 Pq int Include unmodified spill blocks in the send stream. Under certain circumstances, previous versions of ZFS could incorrectly remove the spill block from an existing object. Including unmodified copies of the spill blocks creates a backwards-compatible stream which will recreate a spill block if it was incorrectly removed. . .It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send internal queues. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum number of bytes allowed in .Nm zfs Cm send Ns 's internal queues. . .It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send prefetch queue. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes allowed that will be prefetched by .Nm zfs Cm send . This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm receive queue. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes allowed in the .Nm zfs Cm receive queue. This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum amount of data, in bytes, that .Nm zfs Cm receive will write in one DMU transaction. This is the uncompressed size, even when receiving a compressed send stream. This setting will not reduce the write size below a single block. Capped at a maximum of .Sy 32 MiB . . .It Sy zfs_recv_best_effort_corrective Ns = Ns Sy 0 Pq int When this variable is set to non-zero a corrective receive: .Bl -enum -compact -offset 4n -width "1." .It Does not enforce the restriction of source & destination snapshot GUIDs matching. .It If there is an error during healing, the healing receive is not terminated instead it moves on to the next record. .El . .It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq uint Setting this variable overrides the default logic for estimating block sizes when doing a .Nm zfs Cm send . The default heuristic is that the average block size will be the current recordsize. Override this value if most data in your dataset is not of that size and you require accurate zfs send size estimates. . .It Sy zfs_sync_pass_deferred_free Ns = Ns Sy 2 Pq uint Flushing of data to disk is done in passes. Defer frees starting in this pass. . .It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. . .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this value. This ensures reserved space is available for pool metadata as the special vdevs approach capacity. . .It Sy zfs_sync_pass_dont_compress Ns = Ns Sy 8 Pq uint Starting in this sync pass, disable compression (including of metadata). With the default setting, in practice, we don't have this many sync passes, so this has no effect. .Pp The original intent was that disabling compression would help the sync passes to converge. However, in practice, disabling compression increases the average number of sync passes; because when we turn compression off, many blocks' size will change, and thus we have to re-allocate (not overwrite) them. It also increases the number of .Em 128 KiB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The .Em 128 KiB allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy these allocations. . .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. . .It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int This controls the number of threads used by .Sy dp_sync_taskq . The default value of .Sy 75% will create a maximum of one thread per CPU. . .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before issuing. . .It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped, unless they're part of a larger range which was chunked. This is done because it's common for these small TRIMs to negatively impact overall performance. . .It Sy zfs_trim_metaslab_skip Ns = Ns Sy 0 Ns | Ns 1 Pq uint Skip uninitialized metaslabs during the TRIM process. This option is useful for pools constructed from large thinly-provisioned devices where TRIM operations are slow. As a pool ages, an increasing fraction of the pool's metaslabs will be initialized, progressively degrading the usefulness of this option. This setting is stored when starting a manual TRIM and will persist for the duration of the requested TRIM. . .It Sy zfs_trim_queue_limit Ns = Ns Sy 10 Pq uint Maximum number of queued TRIMs outstanding per leaf vdev. The number of concurrent TRIM commands issued to the device is controlled by .Sy zfs_vdev_trim_min_active No and Sy zfs_vdev_trim_max_active . . .It Sy zfs_trim_txg_batch Ns = Ns Sy 32 Pq uint The number of transaction groups' worth of frees which should be aggregated before TRIM operations are issued to the device. This setting represents a trade-off between issuing larger, more efficient TRIM operations and the delay before the recently trimmed space is available for use by the device. .Pp Increasing this value will allow frees to be aggregated for a longer time. This will result is larger TRIM operations and potentially increased memory usage. Decreasing this value will have the opposite effect. The default of .Sy 32 was determined to be a reasonable compromise. . .It Sy zfs_txg_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest TXGs will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /TXGs . . .It Sy zfs_txg_timeout Ns = Ns Sy 5 Ns s Pq uint Flush dirty data to disk at least every this many seconds (maximum TXG duration). . .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . .It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation immediately follows its predecessor on rotational vdevs for the purpose of making decisions based on load. . .It Sy zfs_vdev_mirror_rotating_seek_inc Ns = Ns Sy 5 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. . .It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int The maximum distance for the last queued I/O operation in which the balancing algorithm considers an operation to have locality. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_mirror_non_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member on non-rotational vdevs when I/O operations do not immediately follow one another. . .It Sy zfs_vdev_mirror_non_rotating_seek_inc Ns = Ns Sy 1 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by the .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. . .It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Aggregate read I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4 KiB Pc Pq uint Aggregate write I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_raidz_impl Ns = Ns Sy fastest Pq string Select the raidz parity implementation to use. .Pp Variants that don't depend on CPU-specific features may be selected on module load, as they are supported on all systems. The remaining options may only be set after the module is loaded, as they are available only if the implementations are compiled in and supported on the running system. .Pp Once the module is loaded, .Pa /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show the available options, with the currently selected one enclosed in square brackets. .Pp .TS lb l l . fastest selected by built-in benchmark original original implementation scalar scalar implementation sse2 SSE2 instruction set 64-bit x86 ssse3 SSSE3 instruction set 64-bit x86 avx2 AVX2 instruction set 64-bit x86 avx512f AVX512F instruction set 64-bit x86 avx512bw AVX512F & AVX512BW instruction sets 64-bit x86 aarch64_neon NEON Aarch64/64-bit ARMv8 aarch64_neonx2 NEON with more unrolling Aarch64/64-bit ARMv8 powerpc_altivec Altivec PowerPC .TE . .It Sy zfs_vdev_scheduler Pq charp .Sy DEPRECATED . Prints warning to kernel log for compatibility. . .It Sy zfs_zevent_len_max Ns = Ns Sy 512 Pq uint Max event queue length. Events in the queue can be viewed with .Xr zpool-events 8 . . .It Sy zfs_zevent_retain_max Ns = Ns Sy 2000 Pq int Maximum recent zevent records to retain for duplicate checking. Setting this to .Sy 0 disables duplicate detection. . .It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15 min Pc Pq int Lifespan for a recent ereport that was retained for duplicate checking. . .It Sy zfs_zil_clean_taskq_maxalloc Ns = Ns Sy 1048576 Pq int The maximum number of taskq entries that are allowed to be cached. When this limit is exceeded transaction records (itxs) will be cleaned synchronously. . .It Sy zfs_zil_clean_taskq_minalloc Ns = Ns Sy 1024 Pq int The number of taskq entries that are pre-populated when the taskq is first created and are immediately available for use. . .It Sy zfs_zil_clean_taskq_nthr_pct Ns = Ns Sy 100 Ns % Pq int This controls the number of threads used by .Sy dp_zil_clean_taskq . The default value of .Sy 100% will create a maximum of one thread per cpu. . .It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint This sets the maximum block size used by the ZIL. On very fragmented pools, lowering this .Pq typically to Sy 36 KiB can improve performance. . +.It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint +This sets the maximum number of write bytes logged via WR_COPIED. +It tunes a tradeoff between additional memory copy and possibly worse log +space efficiency vs additional range lock/unlock. +. .It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 This sets the minimum delay in nanoseconds ZIL care to delay block commit, waiting for more records. If ZIL writes are too fast, kernel may not be able sleep for so short interval, increasing log latency above allowed by .Sy zfs_commit_timeout_pct . . .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. Setting this will cause ZIL corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zil_replay_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . .It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64 Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. . .It Sy zfs_zil_saxattr Ns = Ns Sy 1 Ns | Ns 0 Pq int Setting this tunable to zero disables ZIL logging of new .Sy xattr Ns = Ns Sy sa records if the .Sy org.openzfs:zilsaxattr feature is enabled on the pool. This would only be necessary to work around bugs in the ZIL logging or replay code for this record type. The tunable has no effect if the feature is disabled. . .It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq uint Usually, one metaslab from each normal-class vdev is dedicated for use by the ZIL to log synchronous writes. However, if there are fewer than .Sy zfs_embedded_slog_min_ms metaslabs in the vdev, this functionality is disabled. This ensures that we don't set aside an unreasonable amount of space for the ZIL. . .It Sy zstd_earlyabort_pass Ns = Ns Sy 1 Pq uint Whether heuristic for detection of incompressible data with zstd levels >= 3 using LZ4 and zstd-1 passes is enabled. . .It Sy zstd_abort_size Ns = Ns Sy 131072 Pq uint Minimal uncompressed size (inclusive) of a record before the early abort heuristic will be attempted. . .It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int If non-zero, the zio deadman will produce debugging messages .Pq see Sy zfs_dbgmsg_enable for all zios, rather than only for leaf zios possessing a vdev. This is meant to be used by developers to gain diagnostic information for hang conditions which don't involve a mutex or other locking primitive: typically conditions in which a thread in the zio pipeline is looping indefinitely. . .It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30 s Pc Pq int When an I/O operation takes more than this much time to complete, it's marked as slow. Each slow operation causes a delay zevent. Slow I/O counters can be seen with .Nm zpool Cm status Fl s . . .It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Throttle block allocations in the I/O pipeline. This allows for dynamic allocation distribution when devices are imbalanced. When enabled, the maximum number of pending allocations per top-level vdev is limited by .Sy zfs_vdev_queue_depth_pct . . .It Sy zfs_xattr_compat Ns = Ns 0 Ns | Ns 1 Pq int Control the naming scheme used when setting new xattrs in the user namespace. If .Sy 0 .Pq the default on Linux , user namespace xattr names are prefixed with the namespace, to be backwards compatible with previous versions of ZFS on Linux. If .Sy 1 .Pq the default on Fx , user namespace xattr names are not prefixed, to be backwards compatible with previous versions of ZFS on illumos and .Fx . .Pp Either naming scheme can be read on this and future versions of ZFS, regardless of this tunable, but legacy ZFS on illumos or .Fx are unable to read user namespace xattrs written in the Linux format, and legacy versions of ZFS on Linux are unable to read user namespace xattrs written in the legacy ZFS format. .Pp An existing xattr with the alternate naming scheme is removed when overwriting the xattr so as to not accumulate duplicates. . .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int Prioritize requeued I/O. . .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. These workers are responsible for I/O work such as compression and checksum calculations. Fractional number of CPUs will be rounded down. .Pp The default value of .Sy 80% was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when slower compression and/or checksumming is enabled. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. Lower values improve I/O ordering and CPU utilization, while higher reduces lock contention. .Pp If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on systems with a very large number of zvols. . .It Sy zvol_major Ns = Ns Sy 230 Pq uint Major number for zvol block devices. . .It Sy zvol_max_discard_blocks Ns = Ns Sy 16384 Pq long Discard (TRIM) operations done on zvols will be done in batches of this many blocks, where block size is determined by the .Sy volblocksize property of a zvol. . .It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint When adding a zvol to the system, prefetch this many bytes from the start and end of the volume. Prefetching these regions of the volume is desirable, because they are likely to be accessed immediately by .Xr blkid 8 or the kernel partitioner. . .It Sy zvol_request_sync Ns = Ns Sy 0 Ns | Ns 1 Pq uint When processing I/O requests for a zvol, submit them synchronously. This effectively limits the queue depth to .Em 1 for each I/O submitter. When unset, requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controlled by .Sy zvol_threads . .Sy zvol_request_sync is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. If .Sy 0 (the default) then internally set .Sy zvol_threads to the number of CPUs present or 32 (whichever is greater). . .It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint The number of threads per zvol to use for queuing IO requests. This parameter will only appear if your kernel supports .Li blk-mq and is only read and assigned to a zvol at zvol load time. If .Sy 0 (the default) then internally set .Sy zvol_blk_mq_threads to the number of CPUs present. . .It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint Set to .Sy 1 to use the .Li blk-mq API for zvols. Set to .Sy 0 (the default) to use the legacy zvol APIs. This setting can give better or worse zvol performance depending on the workload. This parameter will only appear if your kernel supports .Li blk-mq and is only read and assigned to a zvol at zvol load time. . .It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint If .Sy zvol_use_blk_mq is enabled, then process this number of .Sy volblocksize Ns -sized blocks per zvol thread. This tunable can be use to favor better performance for zvol reads (lower values) or writes (higher values). If set to .Sy 0 , then the zvol layer will process the maximum number of blocks per thread that it can. This parameter will only appear if your kernel supports .Li blk-mq and is only applied at each zvol's load time. . .It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint The queue_depth value for the zvol .Li blk-mq interface. This parameter will only appear if your kernel supports .Li blk-mq and is only applied at each zvol's load time. If .Sy 0 (the default) then use the kernel's default queue depth. Values are clamped to the kernel's .Dv BLKDEV_MIN_RQ and .Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ limits. . .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices behaviour when .Sy volmode Ns = Ns Sy default : .Bl -tag -compact -offset 4n -width "a" .It Sy 1 .No equivalent to Sy full .It Sy 2 .No equivalent to Sy dev .It Sy 3 .No equivalent to Sy none .El . .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint Enable strict ZVOL quota enforcement. The strict quota enforcement may have a performance impact. .El . .Sh ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations. The scheduler determines when and in what order those operations are issued. The scheduler divides operations into five I/O classes, prioritized in the following order: sync read, sync write, async read, async write, and scrub/resilver. Each queue defines the minimum and maximum number of concurrent operations that may be issued to the device. In addition, the device has an aggregate maximum, .Sy zfs_vdev_max_active . Note that the sum of the per-queue minima must not exceed the aggregate maximum. If the sum of the per-queue maxima exceeds the aggregate maximum, then the number of active operations may reach .Sy zfs_vdev_max_active , in which case no further operations will be issued, regardless of whether all per-queue minima have been met. .Pp For many physical devices, throughput increases with the number of concurrent operations, but latency typically suffers. Furthermore, physical devices typically have a limit at which more concurrent operations have no effect on throughput or can actually cause it to decrease. .Pp The scheduler selects the next operation to issue by first looking for an I/O class whose minimum has not been satisfied. Once all are satisfied and the aggregate maximum has not been hit, the scheduler looks for classes whose maximum has not been satisfied. Iteration through the I/O classes is done in the order specified above. No further operations are issued if the aggregate maximum number of concurrent operations has been hit, or if there are no operations queued for an I/O class that has not hit its maximum. Every time an I/O operation is queued or an operation completes, the scheduler looks for new operations to issue. .Pp In general, smaller .Sy max_active Ns s will lead to lower latency of synchronous operations. Larger .Sy max_active Ns s may lead to higher overall throughput, depending on underlying storage. .Pp The ratio of the queues' .Sy max_active Ns s determines the balance of performance between reads, writes, and scrubs. For example, increasing .Sy zfs_vdev_scrub_max_active will cause the scrub or resilver to complete more quickly, but reads and writes to have higher latency and lower throughput. .Pp All I/O classes have a fixed maximum number of outstanding operations, except for the async write class. Asynchronous writes represent the data that is committed to stable storage during the syncing stage for transaction groups. Transaction groups enter the syncing state periodically, so the number of queued async writes will quickly burst up and then bleed down to zero. Rather than servicing them as quickly as possible, the I/O scheduler changes the maximum number of active async write operations according to the amount of dirty data in the pool. Since both throughput and latency typically increase with the number of concurrent operations issued to physical devices, reducing the burstiness in the number of simultaneous operations also stabilizes the response time of operations from other queues, in particular synchronous ones. In broad strokes, the I/O scheduler will issue more concurrent operations from the async write queue as there is more dirty data in the pool. . .Ss Async Writes The number of concurrent operations issued for the async write I/O class follows a piece-wise linear function defined by a few adjustable points: .Bd -literal | o---------| <-- \fBzfs_vdev_async_write_max_active\fP ^ | /^ | | | / | | active | / | | I/O | / | | count | / | | | / | | |-------o | | <-- \fBzfs_vdev_async_write_min_active\fP 0|_______^______|_________| 0% | | 100% of \fBzfs_dirty_data_max\fP | | | `-- \fBzfs_vdev_async_write_active_max_dirty_percent\fP `--------- \fBzfs_vdev_async_write_active_min_dirty_percent\fP .Ed .Pp Until the amount of dirty data exceeds a minimum percentage of the dirty data allowed in the pool, the I/O scheduler will limit the number of concurrent operations to the minimum. As that threshold is crossed, the number of concurrent operations issued increases linearly to the maximum at the specified maximum percentage of the dirty data allowed in the pool. .Pp Ideally, the amount of dirty data on a busy pool will stay in the sloped part of the function between .Sy zfs_vdev_async_write_active_min_dirty_percent and .Sy zfs_vdev_async_write_active_max_dirty_percent . If it exceeds the maximum percentage, this indicates that the rate of incoming data is greater than the rate that the backend storage can handle. In this case, we must further throttle incoming writes, as described in the next section. . .Sh ZFS TRANSACTION DELAY We delay transactions when we've determined that the backend storage isn't able to accommodate the rate of incoming writes. .Pp If there is already a transaction waiting, we delay relative to when that transaction will finish waiting. This way the calculated delay time is independent of the number of threads concurrently executing transactions. .Pp If we are the only waiter, wait relative to when the transaction started, rather than the current time. This credits the transaction for "time already served", e.g. reading indirect blocks. .Pp The minimum time for a transaction to take is calculated as .D1 min_time = min( Ns Sy zfs_delay_scale No \(mu Po Sy dirty No \- Sy min Pc / Po Sy max No \- Sy dirty Pc , 100ms) .Pp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by .Sy zfs_delay_min_dirty_percent . This should typically be at or above .Sy zfs_vdev_async_write_active_max_dirty_percent , so that we only start to delay after writing at full speed has failed to keep up with the incoming write rate. The scale of the curve is defined by .Sy zfs_delay_scale . Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. .Bd -literal delay 10ms +-------------------------------------------------------------*+ | *| 9ms + *+ | *| 8ms + *+ | * | 7ms + * + | * | 6ms + * + | * | 5ms + * + | * | 4ms + * + | * | 3ms + * + | * | 2ms + (midpoint) * + | | ** | 1ms + v *** + | \fBzfs_delay_scale\fP ----------> ******** | 0 +-------------------------------------*********----------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note, that since the delay is added to the outstanding time remaining on the most recent transaction it's effectively the inverse of IOPS. Here, the midpoint of .Em 500 us translates to .Em 2000 IOPS . The shape of the curve was chosen such that small changes in the amount of accumulated dirty data in the first three quarters of the curve yield relatively small differences in the amount of delay. .Pp The effects can be easier to understand when the amount of delay is represented on a logarithmic scale: .Bd -literal delay 100ms +-------------------------------------------------------------++ + + | | + *+ 10ms + *+ + ** + | (midpoint) ** | + | ** + 1ms + v **** + + \fBzfs_delay_scale\fP ----------> ***** + | **** | + **** + 100us + ** + + * + | * | + * + 10us + * + + + | | + + +--------------------------------------------------------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note here that only as the amount of dirty data approaches its limit does the delay start to increase rapidly. The goal of a properly tuned system should be to keep the amount of dirty data out of that range by first ensuring that the appropriate limits are set for the I/O scheduler to reach optimal throughput on the back-end storage, and then by changing the value of .Sy zfs_delay_scale to increase the steepness of the curve. diff --git a/sys/contrib/openzfs/man/man7/zfsconcepts.7 b/sys/contrib/openzfs/man/man7/zfsconcepts.7 index 18a9e9b5cafe..1be3d961c3d7 100644 --- a/sys/contrib/openzfs/man/man7/zfsconcepts.7 +++ b/sys/contrib/openzfs/man/man7/zfsconcepts.7 @@ -1,207 +1,245 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. +.\" Copyright 2023 Klara, Inc. .\" -.Dd June 30, 2019 +.Dd October 6, 2023 .Dt ZFSCONCEPTS 7 .Os . .Sh NAME .Nm zfsconcepts .Nd overview of ZFS concepts . .Sh DESCRIPTION .Ss ZFS File System Hierarchy A ZFS storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the ZFS file system hierarchy. .Pp The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the .Xr zpool 8 command. .Pp See .Xr zpool 8 for more information on creating and administering pools. .Ss Snapshots A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset. .Pp Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, visibility is determined by the .Sy snapdev property of the parent volume. .Pp File system snapshots can be accessed under the .Pa .zfs/snapshot directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the .Pa .zfs directory can be controlled by the .Sy snapdir property. .Ss Bookmarks A bookmark is like a snapshot, a read-only copy of a file system or volume. Bookmarks can be created extremely quickly, compared to snapshots, and they consume no additional space within the pool. Bookmarks can also have arbitrary names, much like snapshots. .Pp Unlike snapshots, bookmarks can not be accessed through the filesystem in any way. From a storage standpoint a bookmark just provides a way to reference when a snapshot was created as a distinct object. Bookmarks are initially tied to a snapshot, not the filesystem or volume, and they will survive if the snapshot itself is destroyed. Since they are very light weight there's little incentive to destroy them. .Ss Clones A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space. .Pp Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The .Sy origin property exposes this dependency, and the .Cm destroy command lists any such dependencies, if they exist. .Pp The clone parent-child dependency relationship can be reversed by using the .Cm promote subcommand. This causes the .Qq origin file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from. .Ss "Mount Points" Creating a ZFS file system is a simple operation, so the number of file systems per system is likely to be numerous. To cope with this, ZFS automatically manages mounting and unmounting file systems without the need to edit the .Pa /etc/fstab file. All automatically managed file systems are mounted by ZFS at boot time. .Pp By default, file systems are mounted under .Pa /path , where .Ar path is the name of the file system in the ZFS namespace. Directories are created and destroyed as needed. .Pp A file system can also have a mount point set in the .Sy mountpoint property. This directory is created as needed, and ZFS automatically mounts the file system when the .Nm zfs Cm mount Fl a command is invoked .Po without editing .Pa /etc/fstab .Pc . The .Sy mountpoint property can be inherited, so if .Em pool/home has a mount point of .Pa /export/stuff , then .Em pool/home/user automatically inherits a mount point of .Pa /export/stuff/user . .Pp A file system .Sy mountpoint property of .Sy none prevents the file system from being mounted. .Pp If needed, ZFS file systems can also be managed with traditional tools .Po .Nm mount , .Nm umount , .Pa /etc/fstab .Pc . If a file system's mount point is set to .Sy legacy , ZFS makes no attempt to manage the file system, and the administrator is responsible for mounting and unmounting the file system. Because pools must be imported before a legacy mount can succeed, administrators should ensure that legacy mounts are only attempted after the zpool import process finishes at boot time. For example, on machines using systemd, the mount option .Pp .Nm x-systemd.requires=zfs-import.target .Pp will ensure that the zfs-import completes before systemd attempts mounting the filesystem. See .Xr systemd.mount 5 for details. .Ss Deduplication Deduplication is the process for removing redundant data at the block level, reducing the total amount of data stored. If a file system has the .Sy dedup property enabled, duplicate data blocks are removed synchronously. The result is that only unique data is stored and common components are shared among files. .Pp Deduplicating data is a very resource-intensive operation. It is generally recommended that you have at least 1.25 GiB of RAM per 1 TiB of storage when you enable deduplication. Calculating the exact requirement depends heavily on the type of data stored in the pool. .Pp Enabling deduplication on an improperly-designed system can result in performance issues (slow I/O and administrative operations). It can potentially lead to problems importing a pool due to memory exhaustion. Deduplication can consume significant processing power (CPU) and memory as well as generate additional disk I/O. .Pp Before creating a pool with deduplication enabled, ensure that you have planned your hardware requirements appropriately and implemented appropriate recovery practices, such as regular backups. Consider using the .Sy compression property as a less resource-intensive alternative. +.Ss Block cloning +Block cloning is a facility that allows a file (or parts of a file) to be +.Qq cloned , +that is, a shallow copy made where the existing data blocks are referenced +rather than copied. +Later modifications to the data will cause a copy of the data block to be taken +and that copy modified. +This facility is used to implement +.Qq reflinks +or +.Qq file-level copy-on-write . +.Pp +Cloned blocks are tracked in a special on-disk structure called the Block +Reference Table +.Po BRT +.Pc . +Unlike deduplication, this table has minimal overhead, so can be enabled at all +times. +.Pp +Also unlike deduplication, cloning must be requested by a user program. +Many common file copying programs, including newer versions of +.Nm /bin/cp , +will try to create clones automatically. +Look for +.Qq clone , +.Qq dedupe +or +.Qq reflink +in the documentation for more information. +.Pp +There are some limitations to block cloning. +Only whole blocks can be cloned, and blocks can not be cloned if they are not +yet written to disk, or if they are encrypted, or the source and destination +.Sy recordsize +properties differ. +The OS may add additional restrictions; +for example, most versions of Linux will not allow clones across datasets. diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index ba9a95e4a66d..312d76c3e023 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -1,912 +1,890 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0, "ZFS Block Reference Table"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); SYSCTL_DECL(_vfs_zfs_version); SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); /* arc.c */ int param_set_arc_u64(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_int(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_int(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_max(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_max; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min || val >= arc_all_memory())) return (SET_ERROR(EINVAL)); zfs_arc_max = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_max = arc_c_max; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_max, "LU", "Maximum ARC size in bytes (LEGACY)"); /* END CSTYLED */ int param_set_arc_min(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_min; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max)) return (SET_ERROR(EINVAL)); zfs_arc_min = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_min = arc_c_min; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_min, "LU", "Minimum ARC size in bytes (LEGACY)"); /* END CSTYLED */ extern uint_t zfs_arc_free_target; int param_set_arc_free_target(SYSCTL_HANDLER_ARGS) { uint_t val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } /* * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on * pagedaemon initialization. */ /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim" " (LEGACY)"); /* END CSTYLED */ int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { int err, val; val = arc_no_grow_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_write_max; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, &l2arc_write_max, 0, "Max write bytes per interval (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_write_boost; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, &l2arc_write_boost, 0, "Extra write bytes during device warmup (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_headroom; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, &l2arc_headroom, 0, "Number of max device writes to precache (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_headroom_boost; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, "Compressed l2arc_headroom multiplier (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_feed_secs; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, &l2arc_feed_secs, 0, "Seconds between L2ARC writing (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_feed_min_ms; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, "Min feed interval in milliseconds (LEGACY)"); /* END CSTYLED */ extern int l2arc_noprefetch; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, &l2arc_noprefetch, 0, "Skip caching prefetched buffers (LEGACY)"); /* END CSTYLED */ extern int l2arc_feed_again; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, &l2arc_feed_again, 0, "Turbo L2ARC warmup (LEGACY)"); /* END CSTYLED */ extern int l2arc_norw; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, &l2arc_norw, 0, "No reads during writes (LEGACY)"); /* END CSTYLED */ static int param_get_arc_state_size(SYSCTL_HANDLER_ARGS) { arc_state_t *state = (arc_state_t *)arg1; int64_t val; val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); return (sysctl_handle_64(oidp, &val, 0, req)); } extern arc_state_t ARC_anon; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_anon, 0, param_get_arc_state_size, "Q", "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in anonymous state"); /* END CSTYLED */ extern arc_state_t ARC_mru; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru, 0, param_get_arc_state_size, "Q", "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru state"); /* END CSTYLED */ extern arc_state_t ARC_mru_ghost; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru ghost state"); /* END CSTYLED */ extern arc_state_t ARC_mfu; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu, 0, param_get_arc_state_size, "Q", "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu state"); /* END CSTYLED */ extern arc_state_t ARC_mfu_ghost; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu ghost state"); /* END CSTYLED */ extern arc_state_t ARC_uncached; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_uncached, 0, param_get_arc_state_size, "Q", "size of uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in uncached state"); /* END CSTYLED */ extern arc_state_t ARC_l2c_only; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_l2c_only, 0, param_get_arc_state_size, "Q", "size of l2c_only state"); /* END CSTYLED */ /* dbuf.c */ /* dmu.c */ /* dmu_zfetch.c */ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); extern uint32_t zfetch_max_distance; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); /* END CSTYLED */ extern uint32_t zfetch_max_idistance; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream (LEGACY)"); /* END CSTYLED */ /* dsl_pool.c */ /* dnode.c */ /* dsl_scan.c */ /* metaslab.c */ int param_set_active_allocator(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_active_allocator, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_active_allocator) == 0) return (0); return (param_set_active_allocator_common(buf)); } /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ extern int zfs_metaslab_sm_blksz_no_log; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0, "Block size for space map in pools with log space map disabled. " "Power of 2 greater than 4096."); /* END CSTYLED */ /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ extern int zfs_metaslab_sm_blksz_with_log; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0, "Block size for space map in pools with log space map enabled. " "Power of 2 greater than 4096."); /* END CSTYLED */ /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ extern uint_t zfs_condense_pct; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); /* END CSTYLED */ extern uint_t zfs_remove_max_segment; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to allocate when removing" " a device"); /* END CSTYLED */ extern int zfs_removal_suspend_progress; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while in the middle of a removal"); /* END CSTYLED */ /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ extern uint64_t metaslab_df_alloc_threshold; /* BEGIN CSTYLED */ SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic allocator to change its" " allocation strategy"); /* END CSTYLED */ /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ extern uint_t metaslab_df_free_pct; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a" " space map to continue allocations in a first-fit fashion"); /* END CSTYLED */ -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -extern int metaslab_load_pct; - -/* BEGIN CSTYLED */ -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, - CTLFLAG_RWTUN, &metaslab_load_pct, 0, - "Percentage of cpus that can be used by the metaslab taskq"); -/* END CSTYLED */ - -/* - * Max number of metaslabs per group to preload. - */ -extern uint_t metaslab_preload_limit; - -/* BEGIN CSTYLED */ -SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, - CTLFLAG_RWTUN, &metaslab_preload_limit, 0, - "Max number of metaslabs per group to preload"); -/* END CSTYLED */ - /* mmp.c */ int param_set_multihost_interval(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, &zfs_multihost_interval, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (spa_mode_global != SPA_MODE_UNINIT) mmp_signal_all_threads(); return (0); } /* spa.c */ extern int zfs_ccw_retry_interval; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval" " (seconds)"); /* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_cachefile; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, "Allow importing pools with missing top-level vdevs in cache file"); /* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_scan; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, "Allow importing pools with missing top-level vdevs during scan"); /* END CSTYLED */ /* spa_misc.c */ extern int zfs_flags; static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { int err, val; val = zfs_flags; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); /* * ZFS_DEBUG_MODIFY must be enabled prior to boot so all * arc buffers in the system have the necessary additional * checksum data. However, it is safe to disable at any * time. */ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) val &= ~ZFS_DEBUG_MODIFY; zfs_flags = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); /* END CSTYLED */ int param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_synctime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_synctime_ms = val; spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_ziotime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_ziotime_ms = val; spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_deadman_failmode) == 0) return (0); if (strcmp(buf, "wait") == 0) zfs_deadman_failmode = "wait"; if (strcmp(buf, "continue") == 0) zfs_deadman_failmode = "continue"; if (strcmp(buf, "panic") == 0) zfs_deadman_failmode = "panic"; return (-param_set_deadman_failmode_common(buf)); } int param_set_slop_shift(SYSCTL_HANDLER_ARGS) { int val; int err; val = spa_slop_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 1 || val > 31) return (EINVAL); spa_slop_shift = val; return (0); } /* spacemap.c */ extern int space_map_ibs; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, &space_map_ibs, 0, "Space map indirect block shift"); /* END CSTYLED */ /* vdev.c */ int param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_min_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_min_auto_ashift = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), param_set_min_auto_ashift, "IU", "Min ashift used when creating new top-level vdev. (LEGACY)"); /* END CSTYLED */ int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_max_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_max_auto_ashift = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), param_set_max_auto_ashift, "IU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. (LEGACY)"); /* END CSTYLED */ /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ extern int zfs_vdev_dtl_sm_blksz; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0, "Block size for DTL space map. Power of 2 greater than 4096."); /* END CSTYLED */ /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ extern int zfs_vdev_standard_sm_blksz; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); /* END CSTYLED */ extern int vdev_validate_skip; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, &vdev_validate_skip, 0, "Enable to bypass vdev_validate()."); /* END CSTYLED */ /* vdev_mirror.c */ /* vdev_queue.c */ extern uint_t zfs_vdev_max_active; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device." " (LEGACY)"); /* END CSTYLED */ /* zio.c */ /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 22dc0ed5e3b6..b5946e7604c0 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1,10914 +1,10722 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed. For example, when using the ZPL each dentry * holds a references on a znode. These dentries must be pruned before * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static arc_buf_hdr_t **arc_state_evict_markers; static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; static clock_t arc_last_uncached_flush; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static uint_t arc_min_prefetch_ms; static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle: * * total dirty data limit * * anon block dirty limit * * each pool's anon allowance */ static const unsigned long zfs_arc_dirty_limit_percent = 50; static const unsigned long zfs_arc_anon_limit_percent = 25; static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * Balance between metadata and data on ghost hits. Values above 100 * increase metadata caching by proportionally reducing effect of ghost * data hits on target data/metadata rate. */ static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ static uint_t zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux-specific */ static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; static uint_t zfs_arc_lotsfree_percent = 10; /* * Number of arc_prune threads */ static int zfs_arc_prune_task_threads = 1; /* The 7 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "uncached_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "meta", KSTAT_DATA_UINT64 }, { "pd", KSTAT_DATA_UINT64 }, { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_data", KSTAT_DATA_UINT64 }, { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_data", KSTAT_DATA_UINT64 }, { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_data", KSTAT_DATA_UINT64 }, { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_data", KSTAT_DATA_UINT64 }, { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, { "uncached_data", KSTAT_DATA_UINT64 }, { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, { "l2_mru_asize", KSTAT_DATA_UINT64 }, { "l2_mfu_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ } while (0) static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ -#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; typedef enum arc_ovf_level { ARC_OVF_NONE, /* ARC within target size. */ ARC_OVF_SOME, /* ARC is slightly overflowed. */ ARC_OVF_SEVERE /* ARC is severely overflowed. */ } arc_ovf_level_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_hdr_destroy(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static void arc_change_state(arc_state_t *, arc_buf_hdr_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) #define l2arc_hdr_arcstats_increment_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_exclude_special : A zfs module parameter that controls whether buffers * present on special vdevs are eligibile for caching in L2ARC. If * set to 1, exclude dbufs on special vdevs from being cached to * L2ARC. */ int l2arc_exclude_special = 0; /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ static int l2arc_mfuonly = 0; /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ static int l2arc_rebuild_enabled = B_TRUE; static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } uint64_t he = atomic_inc_64_nv( &arc_stats.arcstat_hash_elements.value.ui64); ARCSTAT_MAX(arcstat_hash_elements_max, he); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); #endif multilist_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } -static int -hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) -{ - (void) unused; - arc_buf_hdr_t *hdr = vbuf; - - hdr_full_cons(vbuf, unused, kmflag); - memset(&hdr->b_crypt_hdr, 0, sizeof (hdr->b_crypt_hdr)); - arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); - - return (0); -} - static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } static int buf_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; memset(buf, 0, sizeof (arc_buf_t)); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ static void hdr_full_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); #endif ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } -static void -hdr_full_crypt_dest(void *vbuf, void *unused) -{ - (void) vbuf, (void) unused; - - hdr_full_dest(vbuf, unused); - arc_space_return(sizeof (((arc_buf_hdr_t *)NULL)->b_crypt_hdr), - ARC_SPACE_HDRS); -} - static void hdr_l2only_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } static void buf_dest(void *vbuf, void *unused) { (void) unused; (void) vbuf; arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); - hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", - HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, - NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { #ifdef ZFS_DEBUG ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #else (void) buf; #endif } static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #else (void) buf; #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } #ifdef ZFS_DEBUG /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); #endif return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; ASSERT(size != 0); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); ASSERT3P(tmpbuf, !=, NULL); ASSERT3U(csize, <=, psize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); abd_zero_off(abd, csize, psize - csize); } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret != ENOENT) goto error; if (tmpbuf != NULL) abd_free(abd); return (0); error: if (tmpbuf != NULL) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !ARC_BUF_SHARED(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb, &buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */ if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0) return (cnt); if (state == arc_anon) { arc_hdr_destroy(hdr); return (0); } if (state == arc_uncached && !HDR_PREFETCH(hdr)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); return (0); } multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_increment(hdr, state); return (0); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; + abi->abi_bufcnt = 0; + for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) + abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; - uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); + update_old = (hdr->b_l1hdr.b_buf != NULL || + hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - IMPLY(GHOST_STATE(old_state), bufcnt == 0); - IMPLY(GHOST_STATE(new_state), bufcnt == 0); IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); - IMPLY(old_state == arc_anon, bufcnt <= 1); + IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || + ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; - bufcnt = 0; update_old = B_FALSE; } update_new = update_old; if (GHOST_STATE(old_state)) update_old = B_TRUE; if (GHOST_STATE(new_state)) update_new = B_TRUE; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first - * remove all arc buffers. Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. As a result, we use the arc - * header pointer for the reference. + * remove all arc buffers. Thus, we'll have no arc + * buffer to use for the reference. As a result, we + * use the arc header pointer for the reference. */ (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { l2arc_hdr_arcstats_decrement_state(hdr); hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; l2arc_hdr_arcstats_increment_state(hdr); } } } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. */ ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, const void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - if (encrypted) - hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - - if (ARC_BUF_ENCRYPTED(buf)) { - hdr->b_crypt_hdr.b_ebufcnt -= 1; - - /* - * If we have no more encrypted buffers and we've - * already gotten a copy of the decrypted data we can - * free b_rabd to save some space. - */ - if (hdr->b_crypt_hdr.b_ebufcnt == 0 && - HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && - !HDR_IO_IN_PROGRESS(hdr)) { - arc_hdr_free_abd(hdr, B_TRUE); + /* + * If we have no more encrypted buffers and we've already + * gotten a copy of the decrypted data we can free b_rabd + * to save some space. + */ + if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && + hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *b; + for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { + if (b != buf && ARC_BUF_ENCRYPTED(b)) + break; } + if (b == NULL) + arc_hdr_free_abd(hdr, B_TRUE); } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } /* * Allocate empty anonymous ARC header. The header will get its identity * assigned and buffers attached later as part of read or write operations. * * In case of read arc_read() assigns header its identify (b_dva + b_birth), * inserts it into ARC hash to become globally visible and allocates physical * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read * completion arc_read_done() allocates ARC buffer(s) as needed, potentially * sharing one of them with the physical ABD buffer. * * In case of write arc_alloc_buf() allocates ARC buffer to be filled with * data. Then after compression and/or encryption arc_write_ready() allocates * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD * buffer. On disk write completion arc_write_done() assigns the header its * new identity (b_dva + b_birth) and inserts into ARC hash. * * In case of partial overwrite the old data is read first as described. Then * arc_release() either allocates new anonymous ARC header and moves the ARC * buffer to it, or reuses the old ARC header by discarding its identity and * removing it from ARC hash. After buffer modification normal write process * follows as described. */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - if (protected) { - hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); - } else { - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - } + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); - /* - * if the caller wanted a new full header and the header is to be - * encrypted we will actually allocate the header from the full crypt - * cache instead. The same applies to freeing from the old cache. - */ - if (HDR_PROTECTED(hdr) && new == hdr_full_cache) - new = hdr_full_crypt_cache; - if (HDR_PROTECTED(hdr) && old == hdr_full_cache) - old = hdr_full_crypt_cache; - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); - if (new == hdr_full_cache || new == hdr_full_crypt_cache) { + if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } -/* - * This function allows an L1 header to be reallocated as a crypt - * header and vice versa. If we are going to a crypt header, the - * new fields will be zeroed out. - */ -static arc_buf_hdr_t * -arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) -{ - arc_buf_hdr_t *nhdr; - arc_buf_t *buf; - kmem_cache_t *ncache, *ocache; - - /* - * This function requires that hdr is in the arc_anon state. - * Therefore it won't have any L2ARC data for us to worry - * about copying. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); - ASSERT3P(hdr->b_hash_next, ==, NULL); - - if (need_crypt) { - ncache = hdr_full_crypt_cache; - ocache = hdr_full_cache; - } else { - ncache = hdr_full_cache; - ocache = hdr_full_crypt_cache; - } - - nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); - - /* - * Copy all members that aren't locks or condvars to the new header. - * No lists are pointing to us (as we asserted above), so we don't - * need to worry about the list nodes. - */ - nhdr->b_dva = hdr->b_dva; - nhdr->b_birth = hdr->b_birth; - nhdr->b_type = hdr->b_type; - nhdr->b_flags = hdr->b_flags; - nhdr->b_psize = hdr->b_psize; - nhdr->b_lsize = hdr->b_lsize; - nhdr->b_spa = hdr->b_spa; -#ifdef ZFS_DEBUG - nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; -#endif - nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; - nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; - nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; - nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; - nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; - nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; - nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; - nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; - nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; - nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; - - /* - * This zfs_refcount_add() exists only to ensure that the individual - * arc buffers always point to a header that is referenced, avoiding - * a small race condition that could trigger ASSERTs. - */ - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); - nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; - for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) - buf->b_hdr = nhdr; - - zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); - (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - - if (need_crypt) { - arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); - } else { - arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); - } - - /* unset all members of the original hdr */ - memset(&hdr->b_dva, 0, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_type = 0; - hdr->b_flags = 0; - hdr->b_psize = 0; - hdr->b_lsize = 0; - hdr->b_spa = 0; -#ifdef ZFS_DEBUG - hdr->b_l1hdr.b_freeze_cksum = NULL; -#endif - hdr->b_l1hdr.b_buf = NULL; - hdr->b_l1hdr.b_bufcnt = 0; - hdr->b_l1hdr.b_byteswap = 0; - hdr->b_l1hdr.b_state = NULL; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_mru_hits = 0; - hdr->b_l1hdr.b_mru_ghost_hits = 0; - hdr->b_l1hdr.b_mfu_hits = 0; - hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_acb = NULL; - hdr->b_l1hdr.b_pabd = NULL; - - if (ocache == hdr_full_crypt_cache) { - ASSERT(!HDR_HAS_RABD(hdr)); - hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; - hdr->b_crypt_hdr.b_ebufcnt = 0; - hdr->b_crypt_hdr.b_dsobj = 0; - memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN); - memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN); - memset(hdr->b_crypt_hdr.b_mac, 0, ZIO_DATA_MAC_LEN); - } - - buf_discard_identity(hdr); - kmem_cache_free(ocache, hdr); - - return (nhdr); -} - /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); - if (!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to disk, * it's easiest if we just set up sharing between the buf and the hdr. */ arc_share_buf(hdr, buf); return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; if (incr) { lsize_s = lsize; psize_s = psize; asize_s = asize; } else { lsize_s = -lsize; psize_s = -psize; asize_s = -asize; } /* If the buffer is a prefetch, count it as such. */ if (HDR_PREFETCH(hdr)) { ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); } else { /* * We use the value stored in the L2 header upon initial * caching in L2ARC. This value will be updated in case * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC * metadata (log entry) cannot currently be updated. Having * the ARC state in the L2 header solves the problem of a * possibly absent L1 header (apparent in buffers restored * from persistent L2ARC). */ switch (hdr->b_l2hdr.b_arcs_state) { case ARC_STATE_MRU_GHOST: case ARC_STATE_MRU: ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); break; case ARC_STATE_MFU_GHOST: case ARC_STATE_MFU: ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); break; default: break; } } if (state_only) return; ARCSTAT_INCR(arcstat_l2_psize, psize_s); ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); switch (type) { case ARC_BUFC_DATA: ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); break; case ARC_BUFC_METADATA: ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); break; default: break; } } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif - - if (!HDR_PROTECTED(hdr)) { - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_full_crypt_cache, hdr); - } + kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, tag)); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); arc_buf_destroy_impl(buf); (void) remove_reference(hdr, tag); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted * - arc_uncached -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction * progress at the same (or at least comparable) rate as from non-ghost states. * * Return *real_evicted for actual ARC size reduction to wake up threads * waiting for it. For non-ghost states it includes size of evicted data * buffers (the headers are not freed there). For ghost states it includes * only the evicted headers size. */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached); evicted_state = (state == arc_uncached) ? arc_anon : ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost); /* prefetch buffers have a minimum lifespan */ if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); switch (state->arcs_state) { case ARC_STATE_MRU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mru, HDR_GET_LSIZE(hdr)); break; case ARC_STATE_MFU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mfu, HDR_GET_LSIZE(hdr)); break; default: break; } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } bytes_evicted += arc_hdr_size(hdr); *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed buffer then we * discard it here before we change states. This ensures that the * accounting is updated correctly in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); if (evicted_state == arc_anon) { arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } else { ASSERT(HDR_IN_HASH_TABLE(hdr)); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t revicted; uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. */ static arc_buf_hdr_t ** arc_state_alloc_markers(int count) { arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); for (int i = 0; i < count; i++) { markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_state_impl(). */ markers[i]->b_spa = 0; } return (markers); } static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) kmem_cache_free(hdr_full_cache, markers[i]); kmem_free(markers, sizeof (*markers) * count); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ if (zthr_iscurthread(arc_evict_zthr)) { markers = arc_state_evict_markers; ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); } else { markers = arc_state_alloc_markers(num_sublists); } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } if (markers != arc_state_evict_markers) arc_state_free_markers(markers, num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified. This * function prevents us from trying to evict more from a state's list * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, type, 0, delta)); } return (0); } /* * Adjust specified fraction, taking into account initial ghost state(s) size, * ghost hit bytes towards increasing the fraction, ghost hit bytes towards * decreasing it, plus a balance factor, controlling the decrease rate, used * to balance metadata vs data. */ static uint64_t arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, uint_t balance) { if (total < 8 || up + down == 0) return (frac); /* * We should not have more ghost hits than ghost size, but they * may get close. Restrict maximum adjustment in that case. */ if (up + down >= total / 4) { uint64_t scale = (up + down) / (total / 8); up /= scale; down /= scale; } /* Get maximal dynamic range by choosing optimal shifts. */ int s = highbit64(total); s = MIN(64 - s, 32); uint64_t ofrac = (1ULL << 32) - frac; if (frac >= 4 * ofrac) up /= frac / (2 * ofrac + 1); up = (up << s) / (total >> (32 - s)); if (ofrac >= 4 * frac) down /= ofrac / (2 * frac + 1); down = (down << s) / (total >> (32 - s)); down = down * 100 / balance; return (frac + up - down); } /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t asize, bytes, total_evicted = 0; int64_t e, mrud, mrum, mfud, mfum, w; static uint64_t ogrd, ogrm, ogfd, ogfm; static uint64_t gsrd, gsrm, gsfd, gsfm; uint64_t ngrd, ngrm, ngfd, ngfm; /* Get current size of ARC states we can evict from. */ mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); uint64_t d = mrud + mfud; uint64_t m = mrum + mfum; uint64_t t = d + m; /* Get ARC ghost hits since last eviction. */ ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t grd = ngrd - ogrd; ogrd = ngrd; ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t grm = ngrm - ogrm; ogrm = ngrm; ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t gfd = ngfd - ogfd; ogfd = ngfd; ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t gfm = ngfm - ogfm; ogfm = ngfm; /* Adjust ARC states balance based on ghost hits. */ arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, grm + gfm, grd + gfd, zfs_arc_meta_balance); arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); asize = aggsum_value(&arc_sums.arcstat_size); int64_t wt = t - (asize - arc_c); /* * Try to reduce pinned dnodes if more than 3/4 of wanted metadata * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); w = wt * (int64_t)(arc_meta >> 16) >> 16; if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > w * 3 / 4) { prune = dn / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; } else if (dn > arc_dnode_limit) { prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; } if (prune > 0) arc_prune_async(prune); /* Evict MRU metadata. */ w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; mrum -= bytes; asize -= bytes; /* Evict MFU metadata. */ w = wt * (int64_t)(arc_meta >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); total_evicted += bytes; mfum -= bytes; asize -= bytes; /* Evict MRU data. */ wt -= m - total_evicted; w = wt * (int64_t)(arc_pd >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); total_evicted += bytes; mrud -= bytes; asize -= bytes; /* Evict MFU data. */ e = asize - arc_c; bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); mfud -= bytes; total_evicted += bytes; /* * Evict ghost lists * * Size of each state's ghost list represents how much that state * may grow by shrinking the other states. Would it need to shrink * other states to zero (that is unlikely), its ghost size would be * equal to sum of other three state sizes. But excessive ghost * size may result in false ghost hits (too far back), that may * never result in real cache hits if several states are competing. * So choose some arbitraty point of 1/2 of other state sizes. */ gsrd = (mrum + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - gsrd; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); gsrm = (mrud + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - gsrm; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); gsfd = (mrud + mrum + mfum) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - gsfd; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); gsfm = (mrud + mrum + mfud) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - gsfm; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == NULL); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { uint64_t c = arc_c; if (c <= arc_c_min) return; /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t asize = aggsum_value(&arc_sums.arcstat_size); if (asize < c) to_free += c - asize; arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this call, the data might be out of date if the * evict thread hasn't been woken recently; but that should * suffice. The arc_state_t structures can be queried * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ if (arc_evict_needed) return (B_TRUE); /* * If we have buffers in uncached state, evict them periodically. */ return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) && ddi_get_lbolt() - arc_last_uncached_flush > MSEC_TO_TICK(arc_min_prefetch_ms / 2))); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ static void arc_evict_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Always try to evict from uncached state. */ arc_last_uncached_flush = ddi_get_lbolt(); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE); /* Evict from other states only if told to. */ if (arc_evict_needed) evicted += arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } /* * Called unconditionally every 60 seconds to reclaim unused * zstd compression and decompression context. This is done * here to avoid the need for an independent thread. */ if (!((reap_cb_check_counter++) % 60)) zfs_zstd_cache_reap_now(); return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ static void arc_reap_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); int64_t can_free = arc_c - arc_c_min; if (can_free > 0) { int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; if (to_free > 0) arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(uint64_t bytes) { /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (aggsum_upper_bound(&arc_sums.arcstat_size) + 2 * SPA_MAXBLOCKSIZE >= arc_c) { uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; } } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static arc_ovf_level_t arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; if (!use_reserve) overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); if (alloc_flags & ARC_HDR_ALLOC_LINEAR) return (abd_alloc_linear(size, type == ARC_BUFC_METADATA)); else return (abd_alloc(size, type == ARC_BUFC_METADATA)); } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. Waiting for * eviction ensures that the memory used by the ARC decreases. Waiting for * free memory ensures that the system won't run out of free pages, regardless * of ARC behavior and settings. See arc_lowmem_init(). */ void arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: /* * This is a bit racy without taking arc_evict_lock, but the * worst that can happen is we either call zthr_wakeup() extra * time due to race with other thread here, or the set flag * get cleared by arc_evict_cb(), which is unlikely due to * big hysteresis, but also not important since at this level * of overflow the eviction is purely advisory. Same time * taking the global lock here every time without waiting for * the actual eviction creates a significant lock contention. */ if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } return; case ARC_OVF_SEVERE: default: { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); uint64_t last_count = 0; mutex_enter(&arc_evict_lock); if (!list_is_empty(&arc_evict_waiters)) { arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } /* * Note, the last waiter's count may be less than * arc_evict_count if we are low on memory in which * case arc_evict_state_impl() may have deferred * wakeups (but still incremented arc_evict_count). */ aw.aew_count = MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count reaches * aew_count, or when the ARC is no longer overflowing and * eviction completes. * In case of "false" wakeup, we will still be on the list. */ do { cv_wait(&aw.aew_cv, &arc_evict_lock); } while (list_link_active(&aw.aew_node)); mutex_exit(&arc_evict_lock); cv_destroy(&aw.aew_cv); } } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size[type], size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. */ static void arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); /* * Update buffer prefetch status. */ boolean_t was_prefetch = HDR_PREFETCH(hdr); boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; if (was_prefetch != now_prefetch) { if (was_prefetch) { ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, prefetch); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); if (was_prefetch) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } else { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } if (now_prefetch) { if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_prescient_prefetch); } else { ARCSTAT_BUMP(arcstat_predictive_prefetch); } } if (arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { arc_state_t *new_state; /* * This buffer is not in the cache, and does not appear in * our "ghost" lists. Add it to the MRU or uncached state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = now; if (HDR_UNCACHED(hdr)) { new_state = arc_uncached; DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *, hdr); } else { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { /* * This buffer has been accessed once recently and either * its read is still in progress or it is in the cache. */ if (HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_arc_access = now; return; } hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); /* * If the previous access was a prefetch, then it already * handled possible promotion, so nothing more to do for now. */ if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* * If more than ARC_MINTIME have passed from the previous * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been accessed once recently, but was * evicted from the cache. Would we have bigger MRU, it * would be an MRU hit, so handle it the same way, except * we don't need to check the previous access time. */ hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and either * still in the cache or being restored from one of ghosts. */ if (!HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); } hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { /* * This buffer has been accessed more than once recently, but * has been evicted from the cache. Would we have bigger MFU * it would stay in cache, so move it back to MFU state. */ hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { /* * This buffer is uncacheable, but we got a hit. Probably * a demand read after prefetch. Nothing more to do here. */ if (!HDR_IO_IN_PROGRESS(hdr)) ARCSTAT_BUMP(arcstat_uncached_hits); hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC and was not accessed * for a long time, so treat it as new and put into MRU. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) return; kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zio, (void) zb, (void) bp; if (buf == NULL) return; memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (zio->io_error == 0) { if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { /* We need the last one to call below in original order. */ callback_list = acb; if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, &zio->io_bp->blk_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); } - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_l1hdr.b_cv); - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); (void) remove_reference(hdr, hdr); if (hash_lock != NULL) mutex_exit(hash_lock); /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_prev; if (acb->acb_wait) { mutex_enter(&acb->acb_wait_lock); acb->acb_wait_error = zio->io_error; acb->acb_wait = B_FALSE; cv_signal(&acb->acb_wait_cv); mutex_exit(&acb->acb_wait_lock); /* acb will be freed by the waiting thread. */ } else { kmem_free(acb, sizeof (arc_callback_t)); } } } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: /* * Verify the block pointer contents are reasonable. This should * always be the case since the blkptr is protected by a checksum. * However, if there is damage it's desirable to detect this early * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto done; } if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto done; } zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. * Synchronous reads use the acb_wait_cv whereas nowait * reads register a callback. Both are signalled/called * in arc_read_done. * * Errors of the physical I/O may need to be propagated. * Synchronous read errors are returned here from * arc_read_done via acb_wait_error. Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ arc_callback_t *acb = NULL; if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; if (*arc_flags & ARC_FLAG_WAIT) { acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); } acb->acb_zb = *zb; if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; - if (hdr->b_l1hdr.b_acb) - hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_iohits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, iohits); if (*arc_flags & ARC_FLAG_WAIT) { mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } rc = acb->acb_wait_error; mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_TRUE); if (done && !no_buf) { ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb, &hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { arc_buf_destroy_impl(buf); buf = NULL; (void) remove_reference(hdr, private); } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, hits); *arc_flags |= ARC_FLAG_CACHED; goto done; } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) mutex_exit(hash_lock); rc = SET_ERROR(ENOENT); goto done; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + arc_callback_t *acb = kmem_zalloc( + sizeof (arc_callback_t), KM_SLEEP); + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, + NULL); + acb->acb_zio_head = + hdr->b_l1hdr.b_acb->acb_zio_head; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); goto top; } } if (*arc_flags & ARC_FLAG_UNCACHED) { arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); if (!encrypted_read) alloc_flags |= ARC_HDR_ALLOC_LINEAR; } /* * Take additional reference for IO_IN_PROGRESS. It stops * arc_access() from putting this header without any buffers * and so other references but obviously nonevictable onto * the evictable list of MRU or MFU state. */ add_reference(hdr, hdr); if (!embedded_bp) arc_access(hdr, *arc_flags, B_FALSE); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. */ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); } /* Check if the spa even has l2 configured */ const boolean_t spa_has_l2 = l2arc_ndev != 0 && spa->spa_l2cache.sav_count > 0; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && (*arc_flags & ARC_FLAG_PREFETCH))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Only a spa with l2 should contribute to l2 * miss stats. (Including the case of having a * faulted cache device - that's also a miss.) */ if (spa_has_l2) { /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the * blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); done: if (done) done(NULL, zb, bp, buf, private); if (pio && rc != 0) { zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); zio->io_error = rc; zio_nowait(zio); } goto out; } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ - if (hdr->b_l1hdr.b_bufcnt > 1) { + if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); VERIFY3S(remove_reference(hdr, tag), >, 0); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } - hdr->b_l1hdr.b_bufcnt -= 1; - if (ARC_BUF_ENCRYPTED(buf)) - hdr->b_crypt_hdr.b_ebufcnt -= 1; - arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - if (ARC_BUF_ENCRYPTED(buf)) - nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { return (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ } - if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); - if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); - ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } else { + arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || abd_size_alloc_linear(arc_buf_size(buf))) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); + ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); } callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); else if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t) (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", (u_longlong_t)arc_tempreserve >> 10, (u_longlong_t)meta_esize >> 10, (u_longlong_t)data_esize >> 10, (u_longlong_t)reserve >> 10, (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { data->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); metadata->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); as->arcstat_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); as->arcstat_demand_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); as->arcstat_demand_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); as->arcstat_prefetch_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); as->arcstat_prefetch_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_hits); as->arcstat_mru_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_ghost_hits); as->arcstat_mfu_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); as->arcstat_uncached_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_uncached_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = wmsum_value(&arc_sums.arcstat_mutex_miss); as->arcstat_access_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_access_skip); as->arcstat_evict_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_skip); as->arcstat_evict_not_enough.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_not_enough); as->arcstat_evict_l2_cached.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_cached); as->arcstat_evict_l2_eligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible); as->arcstat_evict_l2_eligible_mfu.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); as->arcstat_evict_l2_eligible_mru.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); as->arcstat_evict_l2_ineligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_chains); as->arcstat_size.value.ui64 = aggsum_value(&arc_sums.arcstat_size); as->arcstat_compressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_compressed_size); as->arcstat_uncompressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_uncompressed_size); as->arcstat_overhead_size.value.ui64 = wmsum_value(&arc_sums.arcstat_overhead_size); as->arcstat_hdr_size.value.ui64 = wmsum_value(&arc_sums.arcstat_hdr_size); as->arcstat_data_size.value.ui64 = wmsum_value(&arc_sums.arcstat_data_size); as->arcstat_metadata_size.value.ui64 = wmsum_value(&arc_sums.arcstat_metadata_size); as->arcstat_dbuf_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dbuf_size); #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_data, &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_data, &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_data, &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_data, &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_data, &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, &as->arcstat_uncached_data, &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_misses); as->arcstat_l2_prefetch_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); as->arcstat_l2_mru_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mru_asize); as->arcstat_l2_mfu_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mfu_asize); as->arcstat_l2_bufc_data_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); as->arcstat_l2_bufc_metadata_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); as->arcstat_l2_feeds.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_feeds); as->arcstat_l2_rw_clash.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rw_clash); as->arcstat_l2_read_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_read_bytes); as->arcstat_l2_write_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_write_bytes); as->arcstat_l2_writes_sent.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_sent); as->arcstat_l2_writes_done.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_done); as->arcstat_l2_writes_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_error); as->arcstat_l2_writes_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); as->arcstat_l2_evict_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); as->arcstat_l2_evict_reading.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_reading); as->arcstat_l2_evict_l1cached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); as->arcstat_l2_free_on_write.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_free_on_write); as->arcstat_l2_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); as->arcstat_l2_cksum_bad.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_cksum_bad); as->arcstat_l2_io_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_io_error); as->arcstat_l2_lsize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_lsize); as->arcstat_l2_psize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_psize); as->arcstat_l2_hdr_size.value.ui64 = aggsum_value(&arc_sums.arcstat_l2_hdr_size); as->arcstat_l2_log_blk_writes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); as->arcstat_l2_log_blk_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); as->arcstat_l2_log_blk_count.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_count); as->arcstat_l2_rebuild_success.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_success); as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); as->arcstat_l2_rebuild_size.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_size); as->arcstat_l2_rebuild_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); as->arcstat_l2_rebuild_bufs.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); as->arcstat_l2_rebuild_bufs_precached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); as->arcstat_l2_rebuild_log_blks.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); as->arcstat_memory_throttle_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_throttle_count); as->arcstat_memory_direct_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_direct_count); as->arcstat_memory_indirect_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_indirect_count); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); as->arcstat_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = wmsum_value(&arc_sums.arcstat_cached_only_in_progress); as->arcstat_abd_chunk_waste_size.value.ui64 = wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } static unsigned int arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) { panic("Header %p insert into arc_l2c_only %p", obj, ml); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (u_longlong_t)(value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); if (arc_dnode_limit > arc_c_max) arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 0 - */ arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_multilist_init(multilist_t *ml, multilist_sublist_index_func_t *index_func, int *maxcountp) { multilist_create(ml, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); } static void arc_state_init(void) { int num_sublists = 0; arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], arc_state_l2c_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], arc_state_l2c_multilist_index_func, &num_sublists); /* * Keep track of the number of markers needed to reclaim buffers from * any ARC state. The markers will be pre-allocated so as to minimize * the number of memory allocations performed by the eviction thread. */ arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_uncached_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); wmsum_init(&arc_sums.arcstat_evict_skip, 0); wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); wmsum_init(&arc_sums.arcstat_compressed_size, 0); wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); wmsum_init(&arc_sums.arcstat_overhead_size, 0); wmsum_init(&arc_sums.arcstat_hdr_size, 0); wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); wmsum_init(&arc_sums.arcstat_l2_feeds, 0); wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); wmsum_init(&arc_sums.arcstat_l2_io_error, 0); wmsum_init(&arc_sums.arcstat_l2_lsize, 0); wmsum_init(&arc_sums.arcstat_l2_psize, 0); aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; arc_uncached->arcs_state = ARC_STATE_UNCACHED; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); wmsum_fini(&arc_sums.arcstat_uncached_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); wmsum_fini(&arc_sums.arcstat_evict_skip); wmsum_fini(&arc_sums.arcstat_evict_not_enough); wmsum_fini(&arc_sums.arcstat_evict_l2_cached); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); wmsum_fini(&arc_sums.arcstat_compressed_size); wmsum_fini(&arc_sums.arcstat_uncompressed_size); wmsum_fini(&arc_sums.arcstat_overhead_size); wmsum_fini(&arc_sums.arcstat_hdr_size); wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); wmsum_fini(&arc_sums.arcstat_l2_mru_asize); wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); wmsum_fini(&arc_sums.arcstat_l2_feeds); wmsum_fini(&arc_sums.arcstat_l2_rw_clash); wmsum_fini(&arc_sums.arcstat_l2_read_bytes); wmsum_fini(&arc_sums.arcstat_l2_write_bytes); wmsum_fini(&arc_sums.arcstat_l2_writes_sent); wmsum_fini(&arc_sums.arcstat_l2_writes_done); wmsum_fini(&arc_sums.arcstat_l2_writes_error); wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_reading); wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); wmsum_fini(&arc_sums.arcstat_l2_free_on_write); wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); wmsum_fini(&arc_sums.arcstat_l2_io_error); wmsum_fini(&arc_sums.arcstat_l2_lsize); wmsum_fini(&arc_sums.arcstat_l2_psize); aggsum_fini(&arc_sums.arcstat_l2_hdr_size); wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); wmsum_fini(&arc_sums.arcstat_memory_throttle_count); wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_set_limits(uint64_t allmem) { /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif arc_set_limits(allmem); #ifdef _KERNEL /* * If zfs_arc_max is non-zero at init, meaning it was set in the kernel * environment before the module was loaded, don't block setting the * maximum because it is less than arc_c_min, instead, reset arc_c_min * to a lower value. * zfs_arc_min will be handled by arc_tuning_update(). */ if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; if (arc_c_min >= arc_c_max) { arc_c_min = MAX(zfs_arc_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); } } #else /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; /* * 32-bit fixed point fractions of metadata from total ARC size, * MRU data from all data and MRU metadata from all metadata. */ arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */ arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_register_hotplug(); arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_state_evict_markers = arc_state_alloc_markers(arc_state_evict_marker_count); arc_evict_zthr = zthr_create_timer("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } if (zfs_wrlog_data_max == 0) { /* * dp_wrlog_total is reduced for each txg at the end of * spa_sync(). However, dp_dirty_total is reduced every time * a block is written out. Thus under normal operation, * dp_wrlog_total could grow 2 times as big as * zfs_dirty_data_max. */ zfs_wrlog_data_max = zfs_dirty_data_max * 2; } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_remove_head(&arc_prune_list)) != NULL) { zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); arc_state_free_markers(arc_state_evict_markers, arc_state_evict_marker_count); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); arc_unregister_hotplug(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. * * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* We need to add in the worst case scenario of log block overhead. */ size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ if (size > dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (l2arc_trim_ahead > 1) { cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); l2arc_trim_ahead = 1; } if (arc_warm == B_FALSE) size += l2arc_write_boost; size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } } return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_writes_error); /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); } else { memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). */ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; buflist = &dev->l2ad_buflist; top: rerun = B_FALSE; if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } if (!all) { /* * In case of cache device removal (all) the following * assertions may be violated without functional consequences * as the device is about to be removed. */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { ASSERT3U(asize, >=, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * In some cases, we can wind up with size > asize, so * we need to opt for the larger allocation option here. * * (We also need abd_return_buf_copy in all cases because * it's an ASSERT() to modify the buffer before returning it * with arc_return_buf(), and all the compressors * write things before deciding to fail compression in nearly * every case.) */ uint64_t bufsize = MAX(size, asize); cabd = abd_alloc_for_io(bufsize, ismd); tmp = abd_borrow_buf(cabd, bufsize); psize = zio_compress_data(compress, to_write, &tmp, size, hdr->b_complevel); if (psize >= asize) { psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, bufsize); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) memset((char *)tmp + psize, 0, bufsize - psize); psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, bufsize); to_write = cabd; } encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * If pass == 1 or 3, we cache MRU metadata and data * respectively. */ if (l2arc_mfuonly) { if (pass == 1 || pass == 3) continue; } multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); /* * If the allocated size of this buffer plus the max * size for the pending log block exceeds the evicted * target size, terminate writing buffers for this run. */ if (write_asize + asize + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); continue; } l2arc_free_abd_on_write(to_write, asize, type); } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; /* * Create a list to save allocated abd buffers * for l2arc_log_blk_commit(). */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); /* * Append buf info to current log and commit if full. * arcstat_l2_{size,asize} kstats are updated * internally. */ if (l2arc_log_blk_insert(dev, hdr)) { /* * l2ad_hand will be adjusted in * l2arc_log_blk_commit(). */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ if (dev->l2ad_evict != l2dhdr->dh_evict) l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } static void l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Decide if dev is eligible for L2ARC rebuild or whole device * trimming. This has to happen before the device is added in the * cache device list and l2arc_dev_mtx is released. Otherwise * l2arc_feed_thread() might already start writing on the * device. */ l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() * in case of onlining a cache device. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); /* * In contrast to l2arc_add_vdev() we do not have to worry about * l2arc_feed_thread() invalidating previous content when onlining a * cache device. The device parameters (l2ad*) are not cleared when * offlining the device and writing new buffers will not invalidate * all previous content. In worst case only buffers that have not had * their log block written to the device will be lost. * When onlining the cache device (ie offline->online without exporting * the pool in between) this happens: * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() * | | * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild * is set to B_TRUE we might write additional buffers to the device. */ l2arc_rebuild_dev(dev, reopen); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Cancel any ongoing or scheduled rebuild. */ mutex_enter(&l2arc_rebuild_thr_lock); if (remdev->l2ad_rebuild_began == B_TRUE) { remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); /* * Remove device from global list */ mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. */ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid, (u_longlong_t)dev->l2ad_hand, (u_longlong_t)dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } if (abd != NULL) abd_free(abd); return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize; /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop), L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls vdev_space_update(). */ l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; uint8_t *tmpbuf = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ memset(tmpbuf + psize, 0, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ memcpy(tmpbuf, lb, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; return (asize); } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads"); diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 8635403d6ad4..e0d4a6a63508 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -1,6291 +1,6284 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal * operation, we will try to write this amount of data to each disk before * moving on to the next top-level vdev. */ static uint64_t metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* * Of blocks of size >= metaslab_force_ganging, actually gang them this often. */ uint_t metaslab_force_ganging_pct = 3; /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ int zfs_metaslab_sm_blksz_no_log = (1 << 14); /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ uint_t zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksz), so a metaslab might use the * same number of blocks after condensing. Since the goal of condensing is to * reduce the number of IOPs required to read the space map, we only want to * condense when we can be sure we will reduce the number of blocks used by the * space map. Unfortunately, we cannot precisely compute whether or not this is * the case in metaslab_should_condense since we are holding ms_lock. Instead, * we apply the following heuristic: do not condense a spacemap unless the * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ static const int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. * Once all groups in the pool reach zfs_mg_noalloc_threshold then all * groups are allowed to accept allocations. Gang blocks are always * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ static uint_t zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their * fragmentation metric (measured as a percentage) is less than or * equal to zfs_mg_fragmentation_threshold. If a metaslab group * exceeds this threshold then it will be skipped unless all metaslab * groups within the metaslab class have also crossed this threshold. * * This tunable was introduced to avoid edge cases where we continue * allocating from very fragmented disks in our pool while other, less * fragmented disks, exists. On the other hand, if all disks in the * pool are uniformly approaching the threshold, the threshold can * be a speed bump in performance, where we keep switching the disks * that we allocate from (e.g. we allocate some segments from disk A * making it bypassing the threshold while freeing segments from disk * B getting its fragmentation below the threshold). * * Empirically, we've seen that our vdev selection for allocations is * good enough that fragmentation increases uniformly across all vdevs * the majority of the time. Thus we set the threshold percentage high * enough to avoid hitting the speed bump on pools that are being pushed * to the edge. */ static uint_t zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ static uint_t zfs_metaslab_fragmentation_threshold = 70; /* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = B_FALSE; /* * When set will prevent metaslabs from being unloaded. */ static int metaslab_debug_unload = B_FALSE; /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ uint_t metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this * limit, fragmented pools can see >100,000 iterations and * metaslab_block_picker() becomes the performance limiting factor on * high-performance storage. * * With the default setting of 16MB, we typically see less than 500 * iterations, even with very fragmented, ashift=9 pools. The maximum number * of iterations possible is: * metaslab_df_max_search / (2 * (1<60KB (but fewer segments in this * bucket, and therefore a lower weight). */ static uint_t zfs_metaslab_find_max_tries = 100; static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; kstat_named_t metaslabstat_reload_tree; kstat_named_t metaslabstat_too_many_tries; kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, { "too_many_tries", KSTAT_DATA_UINT64 }, { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ atomic_inc_64(&metaslab_stats.stat.value.ui64); static kstat_t *metaslab_ksp; void metaslab_stat_init(void) { ASSERT(metaslab_alloc_trace_cache == NULL); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (metaslab_ksp != NULL) { metaslab_ksp->ks_data = &metaslab_stats; kstat_install(metaslab_ksp); } } void metaslab_stat_fini(void) { if (metaslab_ksp != NULL) { kstat_delete(metaslab_ksp); metaslab_ksp = NULL; } kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } /* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count]), KM_SLEEP); mc->mc_spa = spa; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; mca->mca_rotor = NULL; zfs_refcount_create_tracked(&mca->mca_alloc_slots); } return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; ASSERT(mca->mca_rotor == NULL); zfs_refcount_destroy(&mca->mca_alloc_slots); } mutex_destroy(&mc->mc_lock); multilist_destroy(&mc->mc_metaslab_txg_list); kmem_free(mc, offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count])); } int metaslab_class_validate(metaslab_class_t *mc) { metaslab_group_t *mg; vdev_t *vd; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) return (0); do { vd = mg->mg_vd; ASSERT(vd->vdev_mg != NULL); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); return (0); } static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } void metaslab_class_histogram_verify(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); mutex_enter(&mc->mc_lock); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); } mutex_exit(&mc->mc_lock); kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out. */ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. */ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { mutex_enter(&msp->ms_lock); /* * If the metaslab has been removed from the list * (which could happen if we were at the memory limit * and it was evicted during this loop), then we can't * proceed and we should restart the sublist. */ if (!multilist_link_active(&msp->ms_class_txg_node)) { mutex_exit(&msp->ms_lock); i--; break; } mls = multilist_sublist_lock(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && gethrtime() > msp->ms_selected_time + (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { metaslab_evict(msp, txg); } else { /* * Once we've hit a metaslab selected too * recently to evict, we're done evicting for * now. */ mutex_exit(&msp->ms_lock); break; } mutex_exit(&msp->ms_lock); msp = next_msp; } } } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. */ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is * greater than zfs_mg_fragmentation_threshold. If a metaslab group * transitions from allocatable to non-allocatable or vice versa then the * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; boolean_t was_initialized; ASSERT(vd == vd->vdev_top); ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); /* * If the metaslab group was just added then it won't * have any space until we finish syncing out this txg. * At that point we will consider it initialized and available * for allocations. We also don't consider non-activated * metaslab groups (e.g. vdevs that are in the middle of being removed) * to be initialized, because they can't be used for allocation. */ mg->mg_initialized = metaslab_group_initialized(mg); if (!was_initialized && mg->mg_initialized) { mc->mc_groups++; } else if (was_initialized && !mg->mg_initialized) { ASSERT3U(mc->mc_groups, >, 0); mc->mc_groups--; } if (mg->mg_initialized) mg->mg_no_free_space = B_FALSE; /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of * groups in this metaslab class that are still above the * zfs_mg_noalloc_threshold. This is used by the allocating * threads to determine if they should avoid allocations to * a given group. The allocator will avoid allocations to a group * if that group has reached or is below the zfs_mg_noalloc_threshold * and there are still other groups that are above the threshold. * When a group transitions from allocatable to non-allocatable or * vice versa we update the metaslab class to reflect that change. * When the mc_alloc_groups value drops to 0 that means that all * groups have reached the zfs_mg_noalloc_threshold making all groups * eligible for allocations. This effectively means that all devices * are balanced again. */ if (was_allocatable && !mg->mg_allocatable) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } int metaslab_sort_by_flushed(const void *va, const void *vb) { const metaslab_t *a = va; const metaslab_t *b = vb; int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; cmp = TREE_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); return (TREE_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(offsetof(metaslab_group_t, mg_allocator[allocators]), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; mg->mg_allocators = allocators; for (int i = 0; i < allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); } - mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); - return (mg); } void metaslab_group_destroy(metaslab_group_t *mg) { ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); /* * We may have gone below zero with the activation count * either because we never activated in the first place or * because we're done, and possibly removing the vdev. */ ASSERT(mg->mg_activation_count <= 0); - taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); cv_destroy(&mg->mg_ms_disabled_cv); for (int i = 0; i < mg->mg_allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_destroy(&mga->mga_alloc_queue_depth); } kmem_free(mg, offsetof(metaslab_group_t, mg_allocator[mg->mg_allocators])); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) return; mg->mg_aliquot = metaslab_aliquot * MAX(1, vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { mgnext = mgprev->mg_next; mg->mg_prev = mgprev; mg->mg_next = mgnext; mgprev->mg_next = mg; mgnext->mg_prev = mg; } for (int i = 0; i < spa->spa_alloc_count; i++) { mc->mc_allocator[i].mca_rotor = mg; mg = mg->mg_next; } } /* * Passivate a metaslab group and remove it from the allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. */ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); - taskq_wait_outstanding(mg->mg_taskq, 0); + taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; metaslab_t *msp = mga->mga_primary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mgnext = NULL; } else { mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } for (int i = 0; i < spa->spa_alloc_count; i++) { if (mc->mc_allocator[i].mca_rotor == mg) mc->mc_allocator[i].mca_rotor = mgnext; } mg->mg_prev = NULL; mg->mg_next = NULL; } boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } uint64_t metaslab_group_get_space(metaslab_group_t *mg) { /* * Note that the number of nodes in mg_metaslab_tree may be one less * than vdev_ms_count, due to the embedded log metaslab. */ mutex_enter(&mg->mg_lock); uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); mutex_exit(&mg->mg_lock); return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); mutex_enter(&mg->mg_lock); for (metaslab_t *msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { VERIFY3P(msp->ms_group, ==, mg); /* skip if not active */ if (msp->ms_sm == NULL) continue; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } } for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); mutex_exit(&mg->mg_lock); kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } void metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. */ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. We can use * a simple average here since all metaslabs within the group must have * the same size. The return value will be a value between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; if (msp->ms_group != mg) continue; valid_ms++; fragmentation += msp->ms_fragmentation; } if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * Determine if a given metaslab group should skip allocations. A metaslab * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. If the allocation throttle is enabled * then we skip allocations to devices that have reached their maximum * allocation queue depth unless the selected metaslab group is the only * eligible group remaining. */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, int flags, uint64_t psize, int allocator, int d) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* * We can only consider skipping this metaslab group if it's * in the normal metaslab class and there are other metaslab * groups to select from. Otherwise, we always consider it eligible * for allocations. */ if ((mc != spa_normal_class(spa) && mc != spa_special_class(spa) && mc != spa_dedup_class(spa)) || mc->mc_groups <= 1) return (B_TRUE); /* * If the metaslab group's mg_allocatable flag is set (see comments * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then * check if we have reached our allocation limit (mga_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest * gang block size then we allow allocations on this metaslab group * regardless of the mg_allocatable or throttle settings. */ if (mg->mg_allocatable) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; int64_t qdepth; uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); /* * If this metaslab group does not have any free space, then * there is no point in looking further. */ if (mg->mg_no_free_space) return (B_FALSE); /* * Some allocations (e.g., those coming from device removal * where the * allocations are not even counted in the * metaslab * allocation queues) are allowed to bypass * the throttle. */ if (flags & METASLAB_DONT_THROTTLE) return (B_TRUE); /* * Relax allocation throttling for ditto blocks. Due to * random imbalances in allocation it tends to push copies * to one vdev, that looks a bit better at the moment. */ qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); /* * If this metaslab group is below its qmax or it's * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) return (B_TRUE); ASSERT3U(mc->mc_alloc_groups, >, 1); /* * Since this metaslab group is at or over its qmax, we * need to determine if there are metaslab groups after this * one that might be able to handle this allocation. This is * racy since we can't hold the locks for all metaslab * groups at the same time when we make this check. */ for (metaslab_group_t *mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { metaslab_group_allocator_t *mgap = &mgp->mg_allocator[allocator]; qmax = mgap->mga_cur_max_alloc_queue_depth; qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count(&mgap->mga_alloc_queue_depth); /* * If there is another metaslab group that * might be able to handle the allocation, then * we return false so that we skip this group. */ if (qdepth < qmax && !mgp->mg_no_free_space) return (B_FALSE); } /* * We didn't find another group to handle the allocation * so we can't skip this metaslab group even though * we are at or over our qmax. */ return (B_TRUE); } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { return (B_TRUE); } return (B_FALSE); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { const range_seg32_t *r1 = x1; const range_seg32_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { const range_seg64_t *r1 = x1; const range_seg64_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; } metaslab_rt_arg_t; struct mssa_arg { range_tree_t *rt; metaslab_rt_arg_t *mra; }; static void metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) { struct mssa_arg *mssap = arg; range_tree_t *rt = mssap->rt; metaslab_rt_arg_t *mrap = mssap->mra; range_seg_max_t seg = {0}; rs_set_start(&seg, rt, start); rs_set_end(&seg, rt, start + size); metaslab_rt_add(rt, &seg, mrap); } static void metaslab_size_tree_full_load(range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; METASLABSTAT_BUMP(metaslabstat_reload_tree); ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; arg.rt = rt; arg.mra = mrap; range_tree_walk(rt, metaslab_size_sorted_add, &arg); } ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, range_seg32_t, metaslab_rangesize32_compare) ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, range_seg64_t, metaslab_rangesize64_compare) /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. */ static void metaslab_rt_create(range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; size_t size; int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } static void metaslab_rt_destroy(range_tree_t *rt, void *arg) { (void) rt; metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_destroy(size_tree); kmem_free(mrap, sizeof (*mrap)); } static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_add(size_tree, rs); } static void metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_remove(size_tree, rs); } static void metaslab_rt_vacate(range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_clear(size_tree); zfs_btree_destroy(size_tree); metaslab_rt_create(rt, arg); } static const range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, .rtop_remove = metaslab_rt_remove, .rtop_vacate = metaslab_rt_vacate }; /* * ========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t metaslab_largest_allocatable(metaslab_t *msp) { zfs_btree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL) return (0); if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, msp->ms_allocatable)); } /* * Return the maximum contiguous segment within the unflushed frees of this * metaslab. */ static uint64_t metaslab_largest_unflushed_free(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_unflushed_frees == NULL) return (0); if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); /* * When a range is freed from the metaslab, that range is added to * both the unflushed frees and the deferred frees. While the block * will eventually be usable, if the metaslab were loaded the range * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE * txgs had passed. As a result, when attempting to estimate an upper * bound for the largest currently-usable free segment in the * metaslab, we need to not consider any ranges currently in the defer * trees. This algorithm approximates the largest available chunk in * the largest range in the unflushed_frees tree by taking the first * chunk. While this may be a poor estimate, it should only remain so * briefly and should eventually self-correct as frees are no longer * deferred. Similar logic applies to the ms_freed tree. See * metaslab_load() for more details. * * There are two primary sources of inaccuracy in this estimate. Both * are tolerated for performance reasons. The first source is that we * only check the largest segment for overlaps. Smaller segments may * have more favorable overlaps with the other trees, resulting in * larger usable chunks. Second, we only look at the first chunk in * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. */ uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, rsize, &start, &size); if (found) { if (rstart == start) return (0); rsize = start - rstart; } } uint64_t start = 0; uint64_t size = 0; boolean_t found = range_tree_find_in(msp->ms_freed, rstart, rsize, &start, &size); if (found) rsize = start - rstart; return (rsize); } static range_seg_t * metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, uint64_t size, zfs_btree_index_t *where) { range_seg_t *rs; range_seg_max_t rsearch; rs_set_start(&rsearch, rt, start); rs_set_end(&rsearch, rt, start + size); rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { rs = zfs_btree_next(t, where, where); } return (rs); } /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking * for a block that matches the specified criteria. */ static uint64_t metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, uint64_t max_search) { if (*cursor == 0) *cursor = rt->rt_start; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); uint64_t first_found; int count_searched = 0; if (rs != NULL) first_found = rs_get_start(rs, rt); while (rs != NULL && (rs_get_start(rs, rt) - first_found <= max_search || count_searched < metaslab_min_search_count)) { uint64_t offset = rs_get_start(rs, rt); if (offset + size <= rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } rs = zfs_btree_next(bt, &where, &where); count_searched++; } *cursor = 0; return (-1ULL); } static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); metaslab_ops_t *metaslab_allocator(spa_t *spa); static metaslab_ops_t metaslab_allocators[] = { { "dynamic", metaslab_df_alloc }, { "cursor", metaslab_cf_alloc }, { "new-dynamic", metaslab_ndf_alloc }, }; static int spa_find_allocator_byname(const char *val) { int a = ARRAY_SIZE(metaslab_allocators) - 1; if (strcmp("new-dynamic", val) == 0) return (-1); /* remove when ndf is working */ for (; a >= 0; a--) { if (strcmp(val, metaslab_allocators[a].msop_name) == 0) return (a); } return (-1); } void spa_set_allocator(spa_t *spa, const char *allocator) { int a = spa_find_allocator_byname(allocator); if (a < 0) a = 0; spa->spa_active_allocator = a; zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name); } int spa_get_allocator(spa_t *spa) { return (spa->spa_active_allocator); } #if defined(_KERNEL) int param_set_active_allocator_common(const char *val) { char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; int a = spa_find_allocator_byname(val); if (a < 0) return (SET_ERROR(EINVAL)); zfs_active_allocator = metaslab_allocators[a].msop_name; return (0); } #endif metaslab_ops_t * metaslab_allocator(spa_t *spa) { int allocator = spa_get_allocator(spa); return (&metaslab_allocators[allocator]); } /* * ========================================================================== * Dynamic Fit (df) block allocator * * Search for a free chunk of at least this size, starting from the last * offset (for this alignment of block) looking for up to * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not * found within 16MB, then return a free chunk of exactly the requested size (or * larger). * * If it seems like searching from the last offset will be unproductive, skip * that and just return a free chunk of exactly the requested size (or larger). * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This * mechanism is probably not very useful and may be removed in the future. * * The behavior when not searching can be changed to return the largest free * chunk, instead of a free chunk of exactly the requested size, by setting * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { offset = metaslab_block_picker(rt, cursor, size, metaslab_df_max_search); } if (offset == -1) { range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, rt)) { offset = rs_get_start(rs, rt); *cursor = offset + size; } } return (offset); } /* * ========================================================================== * Cursor fit block allocator - * Select the largest region in the metaslab, set the cursor to the beginning * of the range and the cursor_end to the end of the range. As allocations * are made advance the cursor. Continue allocating from the cursor until * the range is exhausted and then find a new range. * ========================================================================== */ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) return (-1ULL); *cursor = rs_get_start(rs, rt); *cursor_end = rs_get_end(rs, rt); } offset = *cursor; *cursor += size; return (offset); } /* * ========================================================================== * New dynamic fit allocator - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift * contiguous blocks. If no region is found then just use the largest segment * that remains. * ========================================================================== */ /* * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) * to request from the allocator. */ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { zfs_btree_t *t = &msp->ms_allocatable->rt_root; range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; range_seg_t *rs; range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); if (max_size < size) return (-1ULL); rs_set_start(&rsearch, rt, *cursor); rs_set_end(&rsearch, rt, *cursor + size); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; rs_set_start(&rsearch, rt, 0); rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift))); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { *cursor = rs_get_start(rs, rt) + size; return (rs_get_start(rs, rt)); } return (-1ULL); } /* * ========================================================================== * Metaslabs * ========================================================================== */ /* * Wait for any in-progress metaslab loads to complete. */ static void metaslab_load_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_loading) { ASSERT(!msp->ms_loaded); cv_wait(&msp->ms_load_cv, &msp->ms_lock); } } /* * Wait for any in-progress flushing to complete. */ static void metaslab_flush_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_flushing) cv_wait(&msp->ms_flush_cv, &msp->ms_lock); } static unsigned int metaslab_idx_func(multilist_t *ml, void *arg) { metaslab_t *msp = arg; /* * ms_id values are allocated sequentially, so full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); } uint64_t metaslab_allocated_space(metaslab_t *msp) { return (msp->ms_allocated_space); } /* * Verify that the space accounting on disk matches the in-core range_trees. */ static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called * from syncing context with a loaded metaslab that has an * allocated space map. Calling this in non-syncing context * does not provide a consistent view of the metaslab since * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; /* * Even though the smp_alloc field can get negative, * when it comes to a metaslab's space map, that should * never be the case. */ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); ASSERT3U(space_map_allocated(msp->ms_sm), >=, range_tree_space(msp->ms_unflushed_frees)); ASSERT3U(metaslab_allocated_space(msp), ==, space_map_allocated(msp->ms_sm) + range_tree_space(msp->ms_unflushed_allocs) - range_tree_space(msp->ms_unflushed_frees)); sm_free_space = msp->ms_size - metaslab_allocated_space(msp); /* * Account for future allocations since we would have * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, msp->ms_allocating_total); ASSERT3U(msp->ms_deferspace, ==, range_tree_space(msp->ms_defer[0]) + range_tree_space(msp->ms_defer[1])); msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } static void metaslab_aux_histograms_clear(metaslab_t *msp) { /* * Auxiliary histograms are only cleared when resetting them, * which can only happen while the metaslab is loaded. */ ASSERT(msp->ms_loaded); memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); for (int t = 0; t < TXG_DEFER_SIZE; t++) memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); } static void metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, range_tree_t *rt) { /* * This is modeled after space_map_histogram_add(), so refer to that * function for implementation details. We want this to work like * the space map histogram, and not the range tree histogram, as we * are essentially constructing a delta that will be later subtracted * from the space map histogram. */ int idx = 0; for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT3U(i, >=, idx + shift); histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + shift, ==, i); idx++; ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } /* * Called at every sync pass that the metaslab gets synced. * * The reason is that we want our auxiliary histograms to be updated * wherever the metaslab's space map histogram is updated. This way * we stay consistent on which parts of the metaslab space map's * histogram are currently not available for allocations (e.g because * they are in the defer, freed, and freeing trees). */ static void metaslab_aux_histograms_update(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(sm != NULL); /* * This is similar to the metaslab's space map histogram updates * that take place in metaslab_sync(). The only difference is that * we only care about segments that haven't made it into the * ms_allocatable tree yet. */ if (msp->ms_loaded) { metaslab_aux_histograms_clear(msp); metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freed); for (int t = 0; t < TXG_DEFER_SIZE; t++) { metaslab_aux_histogram_add(msp->ms_deferhist[t], sm->sm_shift, msp->ms_defer[t]); } } metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freeing); } /* * Called every time we are done syncing (writing to) the metaslab, * i.e. at the end of each sync pass. * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] */ static void metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; space_map_t *sm = msp->ms_sm; if (sm == NULL) { /* * We came here from metaslab_init() when creating/opening a * pool, looking at a metaslab that hasn't had any allocations * yet. */ return; } /* * This is similar to the actions that we take for the ms_freed * and ms_defer trees in metaslab_sync_done(). */ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; if (defer_allowed) { memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, sizeof (msp->ms_synchist)); } else { memset(msp->ms_deferhist[hist_index], 0, sizeof (msp->ms_deferhist[hist_index])); } memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); } /* * Ensure that the metaslab's weight and fragmentation are consistent * with the contents of the histogram (either the range tree's histogram * or the space map's depending whether the metaslab is loaded). */ static void metaslab_verify_weight_and_frag(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can end up here from vdev_remove_complete(), in which case we * cannot do these assertions because we hold spa config locks and * thus we are not allowed to read from the DMU. * * We check if the metaslab group has been removed and if that's * the case we return immediately as that would mean that we are * here from the aforementioned code path. */ if (msp->ms_group == NULL) return; /* * Devices being removed always return a weight of 0 and leave * fragmentation and ms_max_size as is - there is nothing for * us to verify here. */ vdev_t *vd = msp->ms_group->mg_vd; if (vd->vdev_removing) return; /* * If the metaslab is dirty it probably means that we've done * some allocations or frees that have changed our histograms * and thus the weight. */ for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&vd->vdev_ms_list, msp, t)) return; } /* * This verification checks that our in-memory state is consistent * with what's on disk. If the pool is read-only then there aren't * any changes and we just have the initially-loaded state. */ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) return; /* some extra verification for in-core tree if you can */ if (msp->ms_loaded) { range_tree_stat_verify(msp->ms_allocatable); VERIFY(space_map_histogram_verify(msp->ms_sm, msp->ms_allocatable)); } uint64_t weight = msp->ms_weight; uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); uint64_t frag = msp->ms_fragmentation; uint64_t max_segsize = msp->ms_max_size; msp->ms_weight = 0; msp->ms_fragmentation = 0; /* * This function is used for verification purposes and thus should * not introduce any side-effects/mutations on the system's state. * * Regardless of whether metaslab_weight() thinks this metaslab * should be active or not, we want to ensure that the actual weight * (and therefore the value of ms_weight) would be the same if it * was to be recalculated at this point. * * In addition we set the nodirty flag so metaslab_weight() does * not dirty the metaslab for future TXGs (e.g. when trying to * force condensing to upgrade the metaslab spacemaps). */ msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; VERIFY3U(max_segsize, ==, msp->ms_max_size); /* * If the weight type changed then there is no point in doing * verification. Revert fields to their original values. */ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { msp->ms_fragmentation = frag; msp->ms_weight = weight; return; } VERIFY3U(msp->ms_fragmentation, ==, frag); VERIFY3U(msp->ms_weight, ==, weight); } /* * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from * this class that was used longest ago, and attempt to unload it. We don't * want to spend too much time in this loop to prevent performance * degradation, and we expect that most of the time this operation will * succeed. Between that and the normal unloading processing during txg sync, * we expect this to keep the metaslab memory usage under control. */ static void metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); uint_t tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; tries++) { unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { VERIFY3P(mls, ==, multilist_sublist_lock( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); if (!multilist_link_active(&msp->ms_class_txg_node)) { multilist_sublist_unlock(mls); break; } metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); /* * If the metaslab is currently loading there are two * cases. If it's the metaslab we're evicting, we * can't continue on or we'll panic when we attempt to * recursively lock the mutex. If it's another * metaslab that's loading, it can be safely skipped, * since we know it's very new and therefore not a * good eviction candidate. We check later once the * lock is held that the metaslab is fully loaded * before actually unloading it. */ if (msp->ms_loading) { msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); continue; } /* * We can't unload metaslabs with no spacemap because * they're not ready to be unloaded yet. We can't * unload metaslabs with outstanding allocations * because doing so could cause the metaslab's weight * to decrease while it's unloaded, which violates an * invariant that we use to prevent unnecessary * loading. We also don't unload metaslabs that are * currently active because they are high-weight * metaslabs that are likely to be used in the near * future. */ mutex_enter(&msp->ms_lock); if (msp->ms_allocator == -1 && msp->ms_sm != NULL && msp->ms_allocating_total == 0) { metaslab_unload(msp); } mutex_exit(&msp->ms_lock); msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); } } #else (void) mc, (void) zfs_metaslab_mem_limit; #endif } static int metaslab_load_impl(metaslab_t *msp) { int error = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We temporarily drop the lock to unblock other operations while we * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * * If we are using the log space maps, metaslab_sync() can't write to * the metaslab's space map while we are loading as we only write to * it when we are flushing the metaslab, and that can't happen while * we are loading it. * * If we are not using log space maps though, metaslab_sync() can * append to the space map while we are loading. Therefore we load * only entries that existed when we started the load. Additionally, * metaslab_sync_done() has to wait for the load to complete because * there are potential races like metaslab_load() loading parts of the * space map that are currently being appended by metaslab_sync(). If * we didn't, the ms_allocatable would have entries that * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, * ignoring entries appended by metaslab_sync() that happen after we * drop the lock. */ uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); metaslab_rt_arg_t *mrap; if (msp->ms_allocatable->rt_arg == NULL) { mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); } else { mrap = msp->ms_allocatable->rt_arg; msp->ms_allocatable->rt_ops = NULL; msp->ms_allocatable->rt_arg = NULL; } mrap->mra_bt = &msp->ms_allocatable_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); /* Now, populate the size-sorted tree. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; struct mssa_arg arg = {0}; arg.rt = msp->ms_allocatable; arg.mra = mrap; range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, &arg); } else { /* * Add the size-sorted tree first, since we don't need to load * the metaslab from the spacemap. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the * ms_allocatable tree. */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); if (msp->ms_new) { /* * If the ms_sm doesn't exist, this means that this * metaslab hasn't gone through metaslab_sync() and * thus has never been dirtied. So we shouldn't * expect any unflushed allocs or frees from previous * TXGs. */ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from * changing the ms_sm (or log_sm) and the metaslab's range trees * while we are about to use them and populate the ms_allocatable. * The ms_lock is insufficient for this because metaslab_sync() doesn't * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); ASSERT(!msp->ms_condensing); ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); return (error); } ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* * Apply all the unflushed changes to ms_allocatable right * away so any manipulations we do below have a clear view * of what is allocated and what is free. */ range_tree_walk(msp->ms_unflushed_allocs, range_tree_remove, msp->ms_allocatable); range_tree_walk(msp->ms_unflushed_frees, range_tree_add, msp->ms_allocatable); ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (spa_syncing_log_sm(spa) != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); /* * If we use a log space map we add all the segments * that are in ms_unflushed_frees so they are available * for allocation. * * ms_allocatable needs to contain all free segments * that are ready for allocations (thus not segments * from ms_freeing, ms_freed, and the ms_defer trees). * But if we grab the lock in this code path at a sync * pass later that 1, then it also contains the * segments of ms_freed (they were added to it earlier * in this path through ms_unflushed_frees). So we * need to remove all the segments that exist in * ms_freed from ms_allocatable as they will be added * later in metaslab_sync_done(). * * When there's no log space map, the ms_allocatable * correctly doesn't contain any segments that exist * in ms_freed [see ms_synced_length]. */ range_tree_walk(msp->ms_freed, range_tree_remove, msp->ms_allocatable); } /* * If we are not using the log space map, ms_allocatable * contains the segments that exist in the ms_defer trees * [see ms_synced_length]. Thus we need to remove them * from ms_allocatable as they will be added again in * metaslab_sync_done(). * * If we are using the log space map, ms_allocatable still * contains the segments that exist in the ms_defer trees. * Not because it read them through the ms_sm though. But * because these segments are part of ms_unflushed_frees * whose segments we add to ms_allocatable earlier in this * code path. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, msp->ms_allocatable); } /* * Call metaslab_recalculate_weight_and_sort() now that the * metaslab is loaded so we get the metaslab's real weight. * * Unless this metaslab was created with older software and * has not yet been converted to use segment-based weight, we * expect the new weight to be better or equal to the weight * that the metaslab had while it was not loaded. This is * because the old weight does not take into account the * consolidation of adjacent segments between TXGs. [see * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_largest_allocatable(msp); ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); msp->ms_load_time = load_end; zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, unloaded time %llu ms, " "loading_time %lld ms, ms_max_size %llu, " "max size error %lld, " "old_weight %llx, new_weight %llx", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), (u_longlong_t)range_tree_space(msp->ms_freed), (u_longlong_t)range_tree_space(msp->ms_defer[0]), (u_longlong_t)range_tree_space(msp->ms_defer[1]), (longlong_t)((load_start - msp->ms_unload_time) / 1000000), (longlong_t)((load_end - load_start) / 1000000), (u_longlong_t)msp->ms_max_size, (u_longlong_t)msp->ms_max_size - max_size, (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); return (0); } int metaslab_load(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * There may be another thread loading the same metaslab, if that's * the case just wait until the other thread is done and return. */ metaslab_load_wait(msp); if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We set the loading flag BEFORE potentially dropping the lock to * wait for an ongoing flush (see ms_flushing below). This way other * threads know that there is already a thread that is loading this * metaslab. */ msp->ms_loading = B_TRUE; /* * Wait for any in-progress flushing to finish as we drop the ms_lock * both here (during space_map_load()) and in metaslab_flush() (when * we flush our changes to the ms_sm). */ if (msp->ms_flushing) metaslab_flush_wait(msp); /* * In the possibility that we were waiting for the metaslab to be * flushed (where we temporarily dropped the ms_lock), ensure that * no one else loaded the metaslab somehow. */ ASSERT(!msp->ms_loaded); /* * If we're loading a metaslab in the normal class, consider evicting * another one to keep our memory usage under the limit defined by the * zfs_metaslab_mem_limit tunable. */ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == msp->ms_group->mg_class) { metaslab_potentially_evict(msp->ms_group->mg_class); } int error = metaslab_load_impl(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); return (error); } void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * This can happen if a metaslab is selected for eviction (in * metaslab_potentially_evict) and then unloaded during spa_sync (via * metaslab_class_evict_old). */ if (!msp->ms_loaded) return; range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_unload_time = gethrtime(); msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; if (msp->ms_group != NULL) { metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, weight %llx, " "selected txg %llu (%llu ms ago), alloc_txg %llu, " "loaded %llu ms ago, max_size %llu", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_weight, (u_longlong_t)msp->ms_selected_txg, (u_longlong_t)(msp->ms_unload_time - msp->ms_selected_time) / 1000 / 1000, (u_longlong_t)msp->ms_alloc_txg, (u_longlong_t)(msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000, (u_longlong_t)msp->ms_max_size); } /* * We explicitly recalculate the metaslab's weight based on its space * map (as it is now not loaded). We want unload metaslabs to always * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information * available in-core, whether it is loaded or not. * * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation * and the sorting. */ if (msp->ms_group != NULL) metaslab_recalculate_weight_and_sort(msp); } /* * We want to optimize the memory use of the per-metaslab range * trees. To do this, we store the segments in the range trees in * units of sectors, zero-indexing from the start of the metaslab. If * the vdev_ms_shift - the vdev_ashift is less than 32, we can store * the ranges using two uint32_ts, rather than two uint64_ts. */ range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift) { if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && !zfs_metaslab_force_large_segs) { *shift = vdev->vdev_ashift; *start = msp->ms_start; return (RANGE_SEG32); } else { *shift = 0; *start = 0; return (RANGE_SEG64); } } void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); msp->ms_selected_txg = txg; msp->ms_selected_time = gethrtime(); multilist_sublist_insert_tail(mls, msp); multilist_sublist_unlock(mls); } void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { vdev_space_update(vd, alloc_delta, defer_delta, space_delta); ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, vdev_deflated_space(vd, space_delta)); } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); multilist_link_init(&ms->ms_class_txg_node); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; vdev_ops_t *ops = vd->vdev_ops; if (ops->vdev_op_metaslab_init != NULL) ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. For * readonly pools there is no need to open the space map object. * * Note: * When called from vdev_expand(), we can't call into the DMU as * we are holding the spa_config_lock as a writer and we would * deadlock [see relevant comment in vdev_metaslab_init()]. in * that case, the object parameter is zero though, so we won't * call into the DMU. */ if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } uint64_t shift, start; range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_SIZE; t++) { ms->ms_allocating[t] = range_tree_create(NULL, type, NULL, start, shift); } ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ms->ms_defer[t] = range_tree_create(NULL, type, NULL, start, shift); } ms->ms_checkpointing = range_tree_create(NULL, type, NULL, start, shift); ms->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, start, shift); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, type, mrap, start, shift); ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); metaslab_space_update(vd, mg->mg_class, metaslab_allocated_space(ms), 0, 0); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } static void metaslab_fini_flush_data(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (metaslab_unflushed_txg(msp) == 0) { ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, NULL); return; } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), metaslab_unflushed_dirty(msp)); } uint64_t metaslab_unflushed_changes_memused(metaslab_t *ms) { return ((range_tree_numsegs(ms->ms_unflushed_allocs) + range_tree_numsegs(ms->ms_unflushed_frees)) * ms->ms_unflushed_allocs->rt_root.bt_elem_size); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); /* * If this metaslab hasn't been through metaslab_sync_done() yet its * space hasn't been accounted for in its vdev and doesn't need to be * subtracted. */ if (!msp->ms_new) { metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); } space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_destroy(msp->ms_unflushed_allocs); range_tree_destroy(msp->ms_checkpointing); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); range_tree_destroy(msp->ms_unflushed_frees); for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); range_tree_vacate(msp->ms_trim, NULL, NULL); range_tree_destroy(msp->ms_trim); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } #define FRAGMENTATION_TABLE_SIZE 17 /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in * large segments. A 10% change in fragmentation equates to approximately * double the number of segments. * * This table defines 0% fragmented space using 16MB segments. Testing has * shown that segments that are greater than or equal to 16MB do not suffer * from drastic performance problems. Using this value, we derive the rest * of the table. Since the fragmentation value is never stored on disk, it * is possible to change these calculations in the future. */ static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 100, /* 512B */ 100, /* 1K */ 98, /* 2K */ 95, /* 4K */ 90, /* 8K */ 80, /* 16K */ 70, /* 32K */ 60, /* 64K */ 50, /* 128K */ 40, /* 256K */ 30, /* 512K */ 20, /* 1M */ 15, /* 2M */ 10, /* 4M */ 5, /* 8M */ 0 /* 16M */ }; /* * Calculate the metaslab's fragmentation metric and set ms_fragmentation. * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not * been upgraded and does not support this metric. Otherwise, the return * value should be in the range [0, 100]. */ static void metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; uint64_t total = 0; boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); if (!feature_enabled) { msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ if (msp->ms_sm == NULL) { msp->ms_fragmentation = 0; return; } /* * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. */ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; /* * If we've reached the final dirty txg, then we must * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool * is imported. We also skip marking this metaslab for * condensing if the caller has explicitly set nodirty. */ if (!nodirty && spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, (u_longlong_t)vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); total += space; ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); fragmentation += space * zfs_frag_table[idx]; } if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); msp->ms_fragmentation = fragmentation; } /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, * the LBA range, and whether the metaslab is loaded. */ static uint64_t metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The baseline weight is the metaslab's free space. */ space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* * Use the fragmentation information to inversely scale * down the baseline weight. We need to ensure that we * don't exclude this metaslab completely when it's 100% * fragmented. To avoid this we reduce the fragmented value * by 1. */ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; /* * If space < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. The fragmentation metric may have * decreased the space to something smaller than * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE * so that we can consume any remaining space. */ if (space > 0 && space < SPA_MINBLOCKSIZE) space = SPA_MINBLOCKSIZE; } weight = space; /* * Modern disks have uniform bit density and constant angular velocity. * Therefore, the outer recording zones are faster (higher bandwidth) * than the inner zones by the ratio of outer to inner track diameter, * which is typically around 2:1. We account for this by assigning * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } /* * If this metaslab is one we're actively using, adjust its * weight to make it preferable to any inactive metaslab so * we'll polish it off. If the fragmentation on this metaslab * has exceed our threshold, then don't mark it active. */ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } WEIGHT_SET_SPACEBASED(weight); return (weight); } /* * Return the weight of the specified metaslab, according to the segment-based * weighting algorithm. The metaslab must be loaded. This function can * be called within a sync pass since it relies only on the metaslab's * range tree which is always accurate when the metaslab is loaded. */ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp) { uint64_t weight = 0; uint32_t segments = 0; ASSERT(msp->ms_loaded); for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map * and must be downgraded so that all values fit within the * space map's histogram. This allows us to compare loaded * vs. unloaded metaslabs to determine which metaslab is * considered "best". */ if (i > max_idx) continue; if (segments != 0) { WEIGHT_SET_COUNT(weight, segments); WEIGHT_SET_INDEX(weight, i); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Calculate the weight based on the on-disk histogram. Should be applied * only to unloaded metaslabs (i.e no incoming allocations) in-order to * give results consistent with the on-disk state */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(!msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(space_map_object(sm), !=, 0); ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * Create a joint histogram from all the segments that have made * it to the metaslab's space map histogram, that are not yet * available for allocation because they are still in the freeing * pipeline (e.g. freeing, freed, and defer trees). Then subtract * these segments from the space map's histogram to get a more * accurate weight. */ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) deferspace_histogram[i] += msp->ms_synchist[i]; for (int t = 0; t < TXG_DEFER_SIZE; t++) { for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { deferspace_histogram[i] += msp->ms_deferhist[t][i]; } } uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { ASSERT3U(sm->sm_phys->smp_histogram[i], >=, deferspace_histogram[i]); uint64_t count = sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; if (count != 0) { WEIGHT_SET_COUNT(weight, count); WEIGHT_SET_INDEX(weight, i + sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Compute a segment-based weight for the specified metaslab. The weight * is determined by highest bucket in the histogram. The information * for the highest bucket is encoded into the weight value. */ static uint64_t metaslab_segment_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; uint64_t weight = 0; uint8_t shift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The metaslab is completely free. */ if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { WEIGHT_SET_COUNT(weight, 1ULL); WEIGHT_SET_INDEX(weight, idx); } else { WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); WEIGHT_SET_INDEX(weight, max_idx); } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); return (weight); } ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * If the metaslab is fully allocated then just make the weight 0. */ if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to * determine the weight. Otherwise, we rely on the space map information * to generate the weight. */ if (msp->ms_loaded) { weight = metaslab_weight_from_range_tree(msp); } else { weight = metaslab_weight_from_spacemap(msp); } /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that * is associated with this weight. */ if (msp->ms_activation_weight != 0 && weight != 0) WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); return (weight); } /* * Determine if we should attempt to allocate from this metaslab. If the * metaslab is loaded, then we can determine if the desired allocation * can be satisfied by looking at the size of the maximum free segment * on that metaslab. Otherwise, we make our decision based on the metaslab's * weight. For segment-based weighting we can determine the maximum * allocation based on the index encoded in its value. For space-based * weights we rely on the entire weight (excluding the weight-type bit). */ static boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { /* * This case will usually but not always get caught by the checks below; * metaslabs can be loaded by various means, including the trim and * initialize code. Once that happens, without this check they are * allocatable even before they finish their first txg sync. */ if (unlikely(msp->ms_new)) return (B_FALSE); /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. If it's not, the ms_max_size is a lower bound (once * set), and we should use the fast check as long as we're not in * try_hard and it's been less than zfs_metaslab_max_size_cache_sec * seconds since the metaslab was unloaded. */ if (msp->ms_loaded || (msp->ms_max_size != 0 && !try_hard && gethrtime() < msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the * range [2^i, 2^(i+1)), where i is the index in the weight. * Since the asize might be in the middle of the range, we * should attempt the allocation if asize < 2^(i+1). */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); } else { should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } return (should_allocate); } static uint64_t metaslab_weight(metaslab_t *msp, boolean_t nodirty) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; uint64_t weight; ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_set_fragmentation(msp, nodirty); /* * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space * has been added back into the free tree. If the metaslab is * unloaded, we check if there's a larger free segment in the * unflushed frees. This is a lower bound on the largest allocatable * segment size. Coalescing of adjacent entries may reveal larger * allocatable segments, but we aren't aware of those until loading * the space map into a range tree. */ if (msp->ms_loaded) { msp->ms_max_size = metaslab_largest_allocatable(msp); } else { msp->ms_max_size = MAX(msp->ms_max_size, metaslab_largest_unflushed_free(msp)); } /* * Segment-based weighting requires space map histogram support. */ if (zfs_metaslab_segment_weight_enabled && spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))) { weight = metaslab_segment_weight(msp); } else { weight = metaslab_space_weight(msp); } return (weight); } void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp, B_FALSE) | was_active); } static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ if (activation_weight == METASLAB_WEIGHT_CLAIM) { ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(mg, msp, msp->ms_weight | activation_weight); return (0); } metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? &mga->mga_primary : &mga->mga_secondary); mutex_enter(&mg->mg_lock); if (*mspp != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } *mspp = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort_impl(mg, msp, msp->ms_weight | activation_weight); mutex_exit(&mg->mg_lock); return (0); } static int metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The current metaslab is already activated for us so there * is nothing to do. Already activated though, doesn't mean * that this metaslab is activated for our allocator nor our * requested activation weight. The metaslab could have started * as an active one for our allocator but changed allocators * while we were waiting to grab its ms_lock or we stole it * [see find_valid_metaslab()]. This means that there is a * possibility of passivating a metaslab of another allocator * or from a different activation mask, from this thread. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { ASSERT(msp->ms_loaded); return (0); } int error = metaslab_load(msp); if (error != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } /* * When entering metaslab_load() we may have dropped the * ms_lock because we were loading this metaslab, or we * were waiting for another thread to load it for us. In * that scenario, we recheck the weight of the metaslab * to see if it was activated by another thread. * * If the metaslab was activated for another allocator or * it was activated with a different activation weight (e.g. * we wanted to make it a primary but it was activated as * secondary) we return error (EBUSY). * * If the metaslab was activated for the same allocator * and requested activation mask, skip activating it. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { if (msp->ms_allocator != allocator) return (EBUSY); if ((msp->ms_weight & activation_weight) == 0) return (SET_ERROR(EBUSY)); EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), msp->ms_primary); return (0); } /* * If the metaslab has literally 0 space, it will have weight 0. In * that case, don't bother activating it. This can happen if the * metaslab had space during find_valid_metaslab, but another thread * loaded it and used all that space while we were waiting to grab the * lock. */ if (msp->ms_weight == 0) { ASSERT0(range_tree_space(msp->ms_allocatable)); return (SET_ERROR(ENOSPC)); } if ((error = metaslab_activate_allocator(msp->ms_group, msp, allocator, activation_weight)) != 0) { return (error); } ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; } mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); ASSERT3S(0, <=, msp->ms_allocator); ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; if (msp->ms_primary) { ASSERT3P(mga->mga_primary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mga->mga_primary = NULL; } else { ASSERT3P(mga->mga_secondary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); mga->mga_secondary = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); ASSERT(msp->ms_activation_weight != 0); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); } /* * Segment-based metaslabs are activated once and remain active until * we either fail an allocation attempt (similar to space-based metaslabs) * or have exhausted the free space in zfs_metaslab_switch_threshold * buckets since the metaslab was activated. This function checks to see * if we've exhausted the zfs_metaslab_switch_threshold buckets in the * metaslab and passivates it proactively. This will allow us to select a * metaslab with a larger contiguous region, if any, remaining within this * metaslab group. If we're in sync pass > 1, then we continue using this * metaslab so that we don't dirty more block and cause more sync passes. */ static void metaslab_segment_may_passivate(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) return; /* * Since we are in the middle of a sync pass, the most accurate * information that is accessible to us is the in-core range tree * histogram; calculate the new weight based on that information. */ uint64_t weight = metaslab_weight_from_range_tree(msp); int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); int current_idx = WEIGHT_GET_INDEX(weight); if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) metaslab_passivate(msp, weight); } static void metaslab_preload(void *arg) { metaslab_t *msp = arg; metaslab_class_t *mc = msp->ms_group->mg_class; spa_t *spa = mc->mc_spa; fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); (void) metaslab_load(msp); metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_lock); spl_fstrans_unmark(cookie); } static void metaslab_group_preload(metaslab_group_t *mg) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp; avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; - if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait_outstanding(mg->mg_taskq, 0); + if (spa_shutting_down(spa) || !metaslab_preload_enabled) return; - } mutex_enter(&mg->mg_lock); /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { ASSERT3P(msp->ms_group, ==, mg); /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced * to condense then we preload it too. This will ensure * that force condensing happens in the next txg. */ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { continue; } - VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, - msp, TQ_SLEEP) != TASKQID_INVALID); + VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, + msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) + != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } /* * Determine if the space map's on-disk footprint is past our tolerance for * inefficiency. We would like to use the following criteria to make our * decision: * * 1. Do not condense if the size of the space map object would dramatically * increase as a result of writing out the free space range tree. * * 2. Condense if the on on-disk space map representation is at least * zfs_condense_pct/100 times the size of the optimal representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * * 3. Do not condense if the on-disk size of the space map does not actually * decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for * zfs_metaslab_condense_block_threshold - we only condense if the space used * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ if (range_tree_numsegs(msp->ms_allocatable) == 0 || msp->ms_condense_wanted) return (B_TRUE); uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. * The minimized form consists of a small number of allocations followed * by the entries of the free range tree (ms_allocatable). The condensed * spacemap contains all the entries of previous TXGs (including those in * the pool-wide log spacemaps; thus this is effectively a superset of * metaslab_flush()), but this TXG's entries still need to be written. */ static void metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(msp->ms_sm != NULL); /* * In order to condense the space map, we need to change it so it * only describes which segments are currently allocated and free. * * All the current free space resides in the ms_allocatable, all * the ms_defer trees, and all the ms_allocating trees. We ignore * ms_freed because it is empty because we're in sync pass 1. We * ignore ms_freeing because these changes are not yet reflected * in the spacemap (they will be written later this txg). * * So to truncate the space map to represent all the entries of * previous TXGs we do the following: * * 1] We create a range tree (condense tree) that is 100% empty. * 2] We add to it all segments found in the ms_defer trees * as those segments are marked as free in the original space * map. We do the same with the ms_allocating trees for the same * reason. Adding these segments should be a relatively * inexpensive operation since we expect these trees to have a * small number of nodes. * 3] We vacate any unflushed allocs, since they are not frees we * need to add to the condense tree. Then we vacate any * unflushed frees as they should already be part of ms_allocatable. * 4] At this point, we would ideally like to add all segments * in the ms_allocatable tree from the condense tree. This way * we would write all the entries of the condense tree as the * condensed space map, which would only contain freed * segments with everything else assumed to be allocated. * * Doing so can be prohibitively expensive as ms_allocatable can * be large, and therefore computationally expensive to add to * the condense_tree. Instead we first sync out an entry marking * everything as allocated, then the condense_tree and then the * ms_allocatable, in the condensed space map. While this is not * optimal, it is typically close to optimal and more importantly * much cheaper to compute. * * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed * all the unflushed changes to disk, thus we call * metaslab_flush_update(). */ ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %llu, forcing condense=%s", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, (u_longlong_t)msp->ms_group->mg_vd->vdev_id, spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; range_seg_type_t type; uint64_t shift, start; type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, &start, &shift); condense_tree = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); /* * We're about to drop the metaslab's lock thus allowing other * consumers to change it's content. Set the metaslab's ms_condensing * flag to ensure that allocations on this metaslab do not occur * while we're in the middle of committing it to disk. This is only * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); uint64_t object = space_map_object(msp->ms_sm); space_map_truncate(sm, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* * space_map_truncate() may have reallocated the spacemap object. * If so, update the vdev_ms_array. */ if (space_map_object(msp->ms_sm) != object) { object = space_map_object(msp->ms_sm); dmu_write(spa->spa_meta_objset, msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } /* * Note: * When the log space map feature is enabled, each space map will * always have ALLOCS followed by FREES for each sync pass. This is * typically true even when the log space map feature is disabled, * except from the case where a metaslab goes through metaslab_sync() * and gets condensed. In that case the metaslab's space map will have * ALLOCS followed by FREES (due to condensing) followed by ALLOCS * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. */ range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, shift); range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); range_tree_vacate(tmp_tree, NULL, NULL); range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; metaslab_flush_update(msp, tx); } static void metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); mutex_enter(&spa->spa_flushed_ms_lock); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, B_TRUE); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_increment_current_mscount(spa); spa_log_summary_add_flushed_metaslab(spa, B_TRUE); } void metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); /* update metaslab counts of spa_log_sm_t nodes */ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); /* update log space map summary */ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, ms_prev_flushed_dirty); spa_log_summary_add_flushed_metaslab(spa, dirty); /* cleanup obsolete logs if any */ spa_cleanup_old_sm_logs(spa, tx); } /* * Called when the metaslab has been flushed (its own spacemap now reflects * all the contents of the pool-wide spacemap log). Updates the metaslab's * metadata and any pool-wide related log space map data (e.g. summary, * obsolete logs, etc..) to reflect that. */ static void metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); /* * Just because a metaslab got flushed, that doesn't mean that * it will pass through metaslab_sync_done(). Thus, make sure to * update ms_synced_length here in case it doesn't. */ msp->ms_synced_length = space_map_length(msp->ms_sm); /* * We may end up here from metaslab_condense() without the * feature being active. In that case this is a no-op. */ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || metaslab_unflushed_txg(msp) == 0) return; metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); /* * There is nothing wrong with flushing the same metaslab twice, as * this codepath should work on that case. However, the current * flushing scheme makes sure to avoid this situation as we would be * making all these calls without having anything meaningful to write * to disk. We assert this behavior here. */ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); /* * We can not flush while loading, because then we would * not load the ms_unflushed_{allocs,frees}. */ if (msp->ms_loading) return (B_FALSE); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); /* * Metaslab condensing is effectively flushing. Therefore if the * metaslab can be condensed we can just condense it instead of * flushing it. * * Note that metaslab_condense() does call metaslab_flush_update() * so we can just return immediately after condensing. We also * don't need to care about setting ms_flushing or broadcasting * ms_flush_cv, even if we temporarily drop the ms_lock in * metaslab_condense(), as the metaslab is already loaded. */ if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_group_t *mg = msp->ms_group; /* * For all histogram operations below refer to the * comments of metaslab_sync() where we follow a * similar procedure. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); metaslab_condense(msp, tx); space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); ASSERT(range_tree_is_empty(msp->ms_freed)); for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); /* * Since we recreated the histogram (and potentially * the ms_sm too while condensing) ensure that the * weight is updated too because we are not guaranteed * that this metaslab is dirty and will go through * metaslab_sync_done(). */ metaslab_recalculate_weight_and_sort(msp); return (B_TRUE); } msp->ms_flushing = B_TRUE; uint64_t sm_len_before = space_map_length(msp->ms_sm); mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); uint64_t sm_len_after = space_map_length(msp->ms_sm); if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), (u_longlong_t)(sm_len_after - sm_len_before)); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); metaslab_flush_update(msp, tx); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); msp->ms_flushing = B_FALSE; cv_broadcast(&msp->ms_flush_cv); return (B_TRUE); } /* * Write a metaslab to disk in the context of the specified transaction group. */ void metaslab_sync(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ if (msp->ms_new) { ASSERT0(range_tree_space(alloctree)); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); ASSERT0(range_tree_space(msp->ms_trim)); return; } /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being * forced to condense, it's loaded and we're not beyond the final * dirty txg, we need to let it through. Not condensing beyond the * final dirty txg prevents an issue where metaslabs that need to be * condensed but were loaded for other reasons could cause a panic * here. By only checking the txg in that branch of the conditional, * we preserve the utility of the VERIFY statements in all other * cases. */ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted && txg <= spa_final_dirty_txg(spa))) return; VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently * with metaslab_sync() is the metaslab's ms_allocatable. No * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we * could call into the DMU, because the DMU can call down to * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state * concurrently, and it is locked out by the ms_sync_lock. * Note that the ms_lock is insufficient for this, because it * is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); /* * Generate a log space map if one doesn't exist already. */ spa_generate_syncing_log_sm(spa, tx); if (msp->ms_sm == NULL) { uint64_t new_object = space_map_alloc(mos, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &new_object, tx); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * We save the space map object as an entry in vdev_top_zap * so it can be retrieved when the pool is reopened after an * export or through zdb. */ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (new_object), 1, &new_object, tx)); } mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Note: metaslab_condense() clears the space map's histogram. * Therefore we must verify and remove this histogram before * condensing. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); if (spa->spa_sync_pass == 1 && msp->ms_loaded && metaslab_should_condense(msp)) metaslab_condense(msp, tx); /* * We'll be going to disk to sync our space accounting, thus we * drop the ms_lock during that time so allocations coming from * open-context (ZIL) for future TXGs do not block. */ mutex_exit(&msp->ms_lock); space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); if (metaslab_unflushed_txg(msp) == 0) metaslab_unflushed_add(msp, tx); else if (!metaslab_unflushed_dirty(msp)) metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); space_map_write(log_sm, msp->ms_freeing, SM_FREE, vd->vdev_id, tx); mutex_enter(&msp->ms_lock); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_remove_xor_add(alloctree, msp->ms_unflushed_frees, msp->ms_unflushed_allocs); range_tree_remove_xor_add(msp->ms_freeing, msp->ms_unflushed_allocs, msp->ms_unflushed_frees); spa->spa_unflushed_stats.sus_memused += metaslab_unflushed_changes_memused(msp); } else { ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } msp->ms_allocated_space += range_tree_space(alloctree); ASSERT3U(msp->ms_allocated_space, >=, range_tree_space(msp->ms_freeing)); msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); if (!range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the * ms_lock while writing to the checkpoint space map, for the * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); spa->spa_checkpoint_info.sci_dspace += range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, -space_map_allocated(vd->vdev_checkpoint_sm)); range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus * any deferred space. This allows the on-disk histogram * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been * added back into the in-core free tree yet. This will * ensure that we don't end up with a space map histogram * that is completely empty unless the metaslab is fully * allocated. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } } /* * Always add the free space from this sync pass to the space * map histogram. We want to make sure that the on-disk histogram * accounts for all free space. If the space map is not loaded, * then we will lose some accuracy but will correct it the next * time we load the space map. */ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree * and instead will just swap the pointers for freeing and freed. * We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. * * Keep in mind that even if we are currently using a log spacemap * we want current frees to end up in the ms_allocatable (but not * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); ASSERT0(msp->ms_allocated_this_txg); } else { range_tree_vacate(msp->ms_freeing, range_tree_add, msp->ms_freed); } msp->ms_allocated_this_txg += range_tree_space(alloctree); range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); /* * Verify that the space map object ID has been recorded in the * vdev_ms_array. */ uint64_t object; VERIFY0(dmu_read(mos, vd->vdev_ms_array, msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); VERIFY3U(object, ==, space_map_object(msp->ms_sm)); mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } static void metaslab_evict(metaslab_t *msp, uint64_t txg) { if (!msp->ms_loaded || msp->ms_disabled != 0) return; for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); if (!metaslab_debug_unload) metaslab_unload(msp); } /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. */ void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); if (msp->ms_new) { /* this is a new metaslab, add its capacity to the vdev */ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); /* there should be no allocations nor frees at this point */ VERIFY0(msp->ms_allocated_this_txg); VERIFY0(range_tree_space(msp->ms_freed)); } ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { defer_allowed = B_FALSE; } defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); if (spa_syncing_log_sm(spa) == NULL) { /* * If there's a metaslab_load() in progress and we don't have * a log space map, it means that we probably wrote to the * metaslab's space map. If this is the case, we need to * make sure that we wait for the load to complete so that we * have a consistent view at the in-core side of the metaslab. */ metaslab_load_wait(msp); } else { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); } /* * When auto-trimming is enabled, free ranges which are added to * ms_allocatable are also be added to ms_trim. The ms_trim tree is * periodically consumed by the vdev_autotrim_thread() which issues * trims for all ranges and then vacates the tree. The ms_trim tree * can be discarded at any time with the sole consequence of recent * frees not being trimmed. */ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); if (!defer_allowed) { range_tree_walk(msp->ms_freed, range_tree_add, msp->ms_trim); } } else { range_tree_vacate(msp->ms_trim, NULL, NULL); } /* * Move the frees from the defer_tree back to the free * range tree (if it's loaded). Swap the freed_tree and * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { range_tree_swap(&msp->ms_freed, defer_tree); } else { range_tree_vacate(msp->ms_freed, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); } msp->ms_synced_length = space_map_length(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); if (msp->ms_deferspace != 0) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. */ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } metaslab_aux_histograms_update_done(msp, defer_allowed); if (msp->ms_new) { msp->ms_new = B_FALSE; mutex_enter(&mg->mg_lock); mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } /* * Re-sort metaslab within its group now that we've adjusted * its allocatable space. */ metaslab_recalculate_weight_and_sort(msp); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs but only on active * metaslab groups. We can get into a state where the metaslab * is no longer active since we dirty metaslabs as we remove a * a device, thus potentially making the metaslab group eligible * for preloading. */ if (mg->mg_activation_count > 0) { metaslab_group_preload(mg); } spa_config_exit(spa, SCL_ALLOC, FTAG); } /* * When writing a ditto block (i.e. more than one DVA for a given BP) on * the same vdev as an existing DVA of this BP, then try to allocate it * on a different metaslab than existing DVAs (i.e. a unique metaslab). */ static boolean_t metaslab_is_unique(metaslab_t *msp, dva_t *dva) { uint64_t dva_ms_id; if (DVA_GET_ASIZE(dva) == 0) return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (B_TRUE); dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; return (msp->ms_id != dva_ms_id); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { metaslab_alloc_trace_t *mat; if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef ZFS_DEBUG panic("too many entries in allocation list"); #endif METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. */ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); } static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; metaslab_class_allocator_t *mca = &mg->mg_class->mc_allocator[allocator]; uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mga->mga_cur_max_alloc_queue_depth; while (cur < max) { if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, cur, cur + 1) == cur) { atomic_inc_64(&mca->mca_alloc_max_slots); return; } cur = mga->mga_cur_max_alloc_queue_depth; } } void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); if (io_complete) metaslab_group_increment_qdepth(mg, allocator); } void metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag, int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); } #endif } static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); range_tree_clear(msp->ms_trim, start, size); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); msp->ms_allocating_total += size; /* Track the last successful allocation */ msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } /* * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. */ msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } /* * Find the metaslab with the highest weight that is less than what we've * already tried. In the common case, this means that we will examine each * metaslab at most once. Note that concurrent callers could reorder metaslabs * by activation/passivation once we have dropped the mg_lock. If a metaslab is * activated by another thread, and we fail to allocate from the metaslab we * have selected, we may not try the newly-activated metaslab, and instead * activate another metaslab. This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full * except for the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; metaslab_t *msp = avl_find(t, search, &idx); if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); uint_t tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; if (!try_hard && tries > zfs_metaslab_find_max_tries) { METASLABSTAT_BUMP(metaslabstat_too_many_tries); return (NULL); } tries++; if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; } /* * If the selected metaslab is condensing or disabled, * skip it. */ if (msp->ms_condensing || msp->ms_disabled > 0) continue; *was_active = msp->ms_allocator != -1; /* * If we're activating as primary, this is our first allocation * from this disk, so we don't need to check how close we are. * If the metaslab under consideration was already active, * we're getting desperate enough to steal another allocator's * metaslab, so we still don't care about distances. */ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; for (i = 0; i < d; i++) { if (want_unique && !metaslab_is_unique(msp, &dva[i])) break; /* try another metaslab */ } if (i == d) break; } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } static void metaslab_active_mask_verify(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) return; if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(!msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY3S(msp->ms_allocator, ==, -1); return; } } static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active to fill the entire array, we * just use the 0th slot. */ if (mg->mg_ms_ready < mg->mg_allocators * 3) allocator = 0; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mga->mga_primary != NULL) { msp = mga->mga_primary; /* * Even though we don't hold the ms_lock for the * primary metaslab, those fields should not * change while we hold the mg_lock. Thus it is * safe to make assertions on them. */ ASSERT(msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mga->mga_secondary != NULL) { msp = mga->mga_secondary; /* * See comment above about the similar assertions * for the primary metaslab. */ ASSERT(!msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, want_unique, asize, allocator, try_hard, zal, search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } mutex_enter(&msp->ms_lock); metaslab_active_mask_verify(msp); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE3(ms__activation__attempt, metaslab_t *, msp, uint64_t, activation_weight, boolean_t, was_active); #endif /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the * active status first to see if we need to set_selected_txg * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* * If the metaslab was activated for another allocator * while we were waiting in the ms_lock above, or it's * a primary and we're seeking a secondary (or vice versa), * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { ASSERT(msp->ms_loaded); ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } /* * This metaslab was used for claiming regions allocated * by the ZIL during pool import. Once these regions are * claimed we don't need to keep the CLAIM bit set * anymore. Passivate this metaslab to zero its activation * mask. */ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { ASSERT(msp->ms_loaded); ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } metaslab_set_selected_txg(msp, txg); int activation_error = metaslab_activate(msp, allocator, activation_weight); metaslab_active_mask_verify(msp); /* * If the metaslab was activated by another thread for * another allocator or activation_weight (EBUSY), or it * failed because another metaslab was assigned as primary * for this allocator (EEXIST) we continue using this * metaslab for our allocation, rather than going on to a * worse metaslab (we waited for that metaslab to be loaded * after all). * * If the activation failed due to an I/O error or ENOSPC we * skip to the next metaslab. */ boolean_t activated; if (activation_error == 0) { activated = B_TRUE; } else if (activation_error == EBUSY || activation_error == EEXIST) { activated = B_FALSE; } else { mutex_exit(&msp->ms_lock); continue; } ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The * the metaslab is now loaded so metaslab_should_allocate() * can accurately determine if the allocation attempt should * proceed. */ if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, uint64_t, asize); #endif /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); } else { weight = metaslab_weight_from_range_tree(msp); } if (activated) { metaslab_passivate(msp, weight); } else { /* * For the case where we use the metaslab that is * active for another allocator we want to make * sure that we retain the activation mask. * * Note that we could attempt to use something like * metaslab_recalculate_weight_and_sort() that * retains the activation mask here. That function * uses metaslab_weight() to set the weight though * which is not as accurate as the calculations * above. */ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(mg, msp, weight); } metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); kmem_free(search, sizeof (*search)); return (offset); } static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, int allocator, boolean_t try_hard) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate * the minimum gang block size so it must be out of * space. We must notify the allocation throttle * to start skipping allocation attempts to this * metaslab group until more space becomes available. * Note: this failure cannot be caused by the * allocation throttle since the allocation throttle * is only responsible for skipping devices and * not failing block allocations. */ mg->mg_no_free_space = B_TRUE; } } mg->mg_allocations++; mutex_exit(&mg->mg_lock); return (offset); } /* * Allocate a block for the specified i/o. */ int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. * This will result in more split blocks when using device removal, * and a large number of split blocks coupled with ztest-induced * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ if (psize >= metaslab_force_ganging && metaslab_force_ganging_pct > 0 && (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); } /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mca_rotor or mca_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * * If we are doing ditto or log blocks, try to spread them across * consecutive vdevs. If we're forced to reuse a vdev before we've * allocated all of our ditto blocks, then try and spread them out on * that vdev as much as possible. If it turns out to not be possible, * gradually lower our standards until anything becomes acceptable. * Also, allocating on consecutive vdevs (as opposed to random vdevs) * gives us hope of containing our fault domains to something we're * able to reason about. Otherwise, any two top-level vdev failures * will guarantee the loss of data. With consecutive allocation, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep * ourselves on the same vdev as our gang block header. That * way, we can hope for locality in vdev_cache, plus it makes our * fault domains something tractable. */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); /* * It's possible the vdev we're using as the hint no * longer exists or its mg has been closed (e.g. by * device removal). Consult the rotor when * all else fails. */ if (vd != NULL && vd->vdev_mg != NULL) { mg = vdev_get_mg(vd, mc); if (flags & METASLAB_HINTBP_AVOID) mg = mg->mg_next; } else { mg = mca->mca_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; } /* * If the hint put us into the wrong metaslab class, or into a * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mca->mca_rotor; rotor = mg; top: do { boolean_t allocatable; ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* * Don't allocate from faulted devices. */ if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); } else { allocatable = vdev_allocatable(vd); } /* * Determine if the selected metaslab group is eligible * for allocations. If we're ganging then don't allow * this metaslab group to skip allocations since that would * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, flags, psize, allocator, d); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); goto next; } ASSERT(mg->mg_initialized); /* * Avoid writing single-copy data to an unhealthy, * non-redundant vdev, unless we've already tried all * other vdevs. */ if (vd->vdev_state < VDEV_STATE_HEALTHY && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); goto next; } ASSERT(mg->mg_class == mc); uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* * If we don't need to try hard, then require that the * block be on a different metaslab from any other DVAs * in this BP (unique=true). If we are trying hard, then * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* * If we've just selected this metaslab group, * figure out whether the corresponding vdev is * over- or under-used relative to the pool, * and set an allocation bias to even it out. * * Bias is also used to compensate for unequally * sized vdevs so that space is allocated fairly. */ if (mca->mca_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vs_free = vs->vs_space - vs->vs_alloc; int64_t mc_free = mc->mc_space - mc->mc_alloc; int64_t ratio; /* * Calculate how much more or less we should * try to allocate from this device during * this iteration around the rotor. * * This basically introduces a zero-centered * bias towards the devices with the most * free space, while compensating for vdev * size differences. * * Examples: * vdev V1 = 16M/128M * vdev V2 = 16M/128M * ratio(V1) = 100% ratio(V2) = 100% * * vdev V1 = 16M/128M * vdev V2 = 64M/128M * ratio(V1) = 127% ratio(V2) = 72% * * vdev V1 = 16M/128M * vdev V2 = 64M/512M * ratio(V1) = 40% ratio(V2) = 160% */ ratio = (vs_free * mc->mc_alloc_groups * 100) / (mc_free + 1); mg->mg_bias = ((ratio - 100) * (int64_t)mg->mg_aliquot) / 100; } else if (!metaslab_bias_enabled) { mg->mg_bias = 0; } if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; mca->mca_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); return (0); } next: mca->mca_rotor = mg->mg_next; mca->mca_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* * If we haven't tried hard, perhaps do so now. */ if (!try_hard && (zfs_metaslab_try_hard_before_gang || GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || psize <= 1 << spa->spa_min_ashift)) { METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } memset(&dva[d], 0, sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); if (range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); range_tree_add(msp->ms_checkpointing, offset, asize); } else { range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) vdev_indirect_mark_obsolete(vd, offset, size); else metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when * we complete the removal, we first change the vdev to be * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } else { metaslab_free_concrete(vd, offset, size, checkpoint); } } typedef struct remap_blkptr_cb_arg { blkptr_t *rbca_bp; spa_remap_cb_t rbca_cb; vdev_t *rbca_remap_vd; uint64_t rbca_remap_offset; void *rbca_cb_arg; } remap_blkptr_cb_arg_t; static void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { remap_blkptr_cb_arg_t *rbca = arg; blkptr_t *bp = rbca->rbca_bp; /* We can not remap split blocks. */ if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) return; ASSERT0(inner_offset); if (rbca->rbca_cb != NULL) { /* * At this point we know that we are not handling split * blocks and we invoke the callback on the previous * vdev which must be indirect. */ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); /* set up remap_blkptr_cb_arg for the next call */ rbca->rbca_remap_vd = vd; rbca->rbca_remap_offset = offset; } /* * The phys birth time is that of dva[0]. This ensures that we know * when each dva was written, so that resilver can determine which * blocks need to be scrubbed (i.e. those written during the time * the vdev was offline). It also ensures that the key used in * the ARC hash table is unique (i.e. dva[0] + phys_birth). If * we didn't change the phys_birth, a lookup in the ARC for a * remapped BP could find the data that was previously stored at * this vdev + offset. */ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); } /* * If the block pointer contains any indirect DVAs, modify them to refer to * concrete DVAs. Note that this will sometimes not be possible, leaving * the indirect DVA in place. This happens if the indirect DVA spans multiple * segments in the mapping (i.e. it is a "split block"). * * If the BP was remapped, calls the callback on the original dva (note the * callback can be called multiple times if the original indirect DVA refers * to another indirect DVA, etc). * * Returns TRUE if the BP was remapped. */ boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { remap_blkptr_cb_arg_t rbca; if (!zfs_remap_blkptr_enable) return (B_FALSE); if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) return (B_FALSE); /* * Dedup BP's can not be remapped, because ddt_phys_select() depends * on DVA[0] being the same in the BP as in the DDT (dedup table). */ if (BP_GET_DEDUP(bp)) return (B_FALSE); /* * Gang blocks can not be remapped, because * zio_checksum_gang_verifier() depends on the DVA[0] that's in * the BP used to read the gang block header (GBH) being the same * as the DVA[0] that we allocated for the GBH. */ if (BP_IS_GANG(bp)) return (B_FALSE); /* * Embedded BP's have no DVA to remap. */ if (BP_GET_NDVAS(bp) < 1) return (B_FALSE); /* * Note: we only remap dva[0]. If we remapped other dvas, we * would no longer know what their phys birth txg is. */ dva_t *dva = &bp->blk_dva[0]; uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops->vdev_op_remap == NULL) return (B_FALSE); rbca.rbca_bp = bp; rbca.rbca_cb = callback; rbca.rbca_remap_vd = vd; rbca.rbca_remap_offset = offset; rbca.rbca_cb_arg = arg; /* * remap_blkptr_cb() will be called in order for each level of * indirection, until a concrete vdev is reached or a split block is * encountered. old_vd and old_offset are updated within the callback * as we go from the one indirect vdev to the next one (either concrete * or indirect again) in that order. */ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); /* Check if the DVA wasn't remapped because it is a split block */ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) return (B_FALSE); return (B_TRUE); } /* * Undo the allocation of a DVA which happened in the given transaction group. */ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset, (u_longlong_t)size); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_gang_header_asize(vd); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. If there aren't any available slots * then the I/O will be throttled until an I/O completes and its slots are * freed up. The function returns true if it was successful in placing * the reservation. */ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; uint64_t max = mca->mca_alloc_max_slots; ASSERT(mc->mc_alloc_throttle_enabled); if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) || zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { /* * The potential race between _count() and _add() is covered * by the allocator lock in most cases, or irrelevant due to * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others. * But even if we assume some other non-existing scenario, the * worst that can happen is few more I/Os get to allocation * earlier, that is not a problem. * * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } return (B_FALSE); } void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); } static int metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int error = 0; if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) return (SET_ERROR(ENXIO)); ASSERT3P(vd->vdev_ms, !=, NULL); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); if (error == EBUSY) { ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); error = 0; } } if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); range_tree_remove(msp->ms_allocatable, offset, size); range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (!multilist_link_active(&msp->ms_class_txg_node)) { msp->ms_selected_txg = txg; multilist_sublist_insert_head(mls, msp); } multilist_sublist_unlock(mls); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total += size; } mutex_exit(&msp->ms_lock); return (0); } typedef struct metaslab_claim_cb_arg_t { uint64_t mcca_txg; int mcca_error; } metaslab_claim_cb_arg_t; static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size, mcca_arg->mcca_txg); } } int metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { if (vd->vdev_ops->vdev_op_remap != NULL) { metaslab_claim_cb_arg_t arg; /* * Only zdb(8) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ ASSERT(!spa_writeable(vd->vdev_spa)); arg.mcca_error = 0; arg.mcca_txg = txg; vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb, &arg); if (arg.mcca_error == 0) { arg.mcca_error = metaslab_claim_concrete(vd, offset, size, txg); } return (arg.mcca_error); } else { return (metaslab_claim_concrete(vd, offset, size, txg)); } } /* * Intent log support: upon opening the pool after a crash, notify the SPA * of blocks that the intent log has allocated for immediate write, but * which are still considered free by the SPA because the last transaction * group didn't commit yet. */ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; ASSERT(bp->blk_birth == 0); ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_allocator[allocator].mca_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, 0); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (bp->blk_birth <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. */ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. */ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { error = metaslab_claim_dva(spa, &dva[d], txg); if (error != 0) break; } spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner, (void) arg; if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; spa_t *spa __maybe_unused = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { range_tree_verify_not_present(msp->ms_allocatable, offset, size); } /* * Check all segments that currently exist in the freeing pipeline. * * It would intuitively make sense to also check the current allocating * tree since metaslab_unalloc_dva() exists for extents that are * allocated and freed in the same sync pass within the same txg. * Unfortunately there are places (e.g. the ZIL) where we allocate a * segment but then we free part of it within the same txg * [see zil_sync()]. Thus, we don't call range_tree_verify() in the * current allocating tree. */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_gang_header_asize(vd); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } static void metaslab_group_disable_wait(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); while (mg->mg_disabled_updating) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } } static void metaslab_group_disabled_increment(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); ASSERT(mg->mg_disabled_updating); while (mg->mg_ms_disabled >= max_disabled_ms) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } mg->mg_ms_disabled++; ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); } /* * Mark the metaslab as disabled to prevent any allocations on this metaslab. * We must also track how many metaslabs are currently disabled within a * metaslab group and limit them to prevent allocation failures from * occurring because all metaslabs are disabled. */ void metaslab_disable(metaslab_t *msp) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; mutex_enter(&mg->mg_ms_disabled_lock); /* * To keep an accurate count of how many threads have disabled * a specific metaslab group, we only allow one thread to mark * the metaslab group at a time. This ensures that the value of * ms_disabled will be accurate when we decide to mark a metaslab * group as disabled. To do this we force all other threads * to wait till the metaslab's mg_disabled_updating flag is no * longer set. */ metaslab_group_disable_wait(mg); mg->mg_disabled_updating = B_TRUE; if (msp->ms_disabled == 0) { metaslab_group_disabled_increment(mg); } mutex_enter(&msp->ms_lock); msp->ms_disabled++; mutex_exit(&msp->ms_lock); mg->mg_disabled_updating = B_FALSE; cv_broadcast(&mg->mg_ms_disabled_cv); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; /* * Wait for the outstanding IO to be synced to prevent newly * allocated blocks from being overwritten. This used by * initialize and TRIM which are modifying unallocated space. */ if (sync) txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&mg->mg_ms_disabled_lock); mutex_enter(&msp->ms_lock); if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); if (unload) metaslab_unload(msp); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) { ms->ms_unflushed_dirty = dirty; } static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { vdev_t *vd = ms->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); metaslab_unflushed_phys_t entry = { .msp_unflushed_txg = metaslab_unflushed_txg(ms), }; uint64_t entry_size = sizeof (entry); uint64_t entry_offset = ms->ms_id * entry_size; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) { object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object, tx)); } else { VERIFY0(err); } dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, &entry, tx); } void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } boolean_t metaslab_unflushed_dirty(metaslab_t *ms) { return (ms->ms_unflushed_dirty); } uint64_t metaslab_unflushed_txg(metaslab_t *ms) { return (ms->ms_unflushed_txg); } ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW, "Allocation granularity (a.k.a. stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, "Load all metaslabs when pool is first opened"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, "Prevent metaslabs from being unloaded"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, + "Max number of metaslabs per group to preload"); + ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be considered eligible " "for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, ZMOD_RW, "Fragmentation for metaslab to allow allocation"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, "Enable metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZMOD_RW, "Enable segment-based metaslab selection"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, "Blocks larger than this size are sometimes forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZMOD_RW, "Try hard to allocate before ganging"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index cda62f939c1e..0b69568e2aad 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -1,10183 +1,10215 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "iss", "iss_h", "int", "int_h" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, * but with number of taskqs also scaling with number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport); static void spa_vdev_resilver_done(spa_t *spa); +/* + * Percentage of all CPUs that can be used by the metaslab preload taskq. + */ +static uint_t metaslab_preload_pct = 50; + static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ static uint_t zio_taskq_batch_tpq; /* threads per taskq */ static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). * This is used by zdb for spacemaps verification. */ boolean_t spa_mode_readable_spacemaps = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ static int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". * * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ static const boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ static int zfs_livelist_condense_zthr_pause = 0; static int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. */ static int zfs_livelist_condense_sync_cancel = 0; static int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ static int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } /* * Add a user property (source=src, propname=propval) to an nvlist. */ static void spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, zprop_source_t src) { nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, brt_get_used(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, brt_get_saved(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, brt_get_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_compatibility != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; dsl_pool_t *dp; int err; err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); if (err) return (err); dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) goto out; /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_dataset_t *ds = NULL; err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); if (err != 0) break; strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } if (prop != ZPOOL_PROP_INVAL) { spa_prop_add_list(*nvp, prop, strval, 0, src); } else { src = ZPROP_SRC_LOCAL; spa_prop_add_user(*nvp, za.za_name, strval, src); } kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; const char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: /* * Sanitize the input. */ if (zfs_prop_user(propname)) { if (strlen(propname) >= ZAP_MAXNAMELEN) { error = SET_ERROR(ENAMETOOLONG); break; } if (strlen(fnvpair_value_string(elem)) >= ZAP_MAXVALUELEN) { error = SET_ERROR(E2BIG); break; } } else if (zpool_prop_feature(propname)) { if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; } else { error = SET_ERROR(EINVAL); break; } break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: case ZPOOL_PROP_AUTOTRIM: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_MULTIHOST: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); if (!error) { uint32_t hostid = zone_get_hostid(NULL); if (hostid) spa->spa_hostid = hostid; else error = SET_ERROR(ENOTSUP); } break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* Must be ZPL. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; default: break; } if (error) break; } (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { const char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_INVAL && zfs_prop_user(nvpair_name(elem))) { need_sync = B_TRUE; break; } if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver = 0; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid __maybe_unused = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", (u_longlong_t)oldguid, (u_longlong_t)*newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { /* * Clear the kobj flag from all the vdevs to allow * vdev_cache_process_kobj_evt() to post events to all the * vdevs since GUID is updated. */ vdev_clear_kobj_evt(spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (TREE_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; boolean_t batch = B_FALSE; switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = MIN(zio_taskq_batch_pct, 100); break; case ZTI_MODE_SCALE: flags |= TASKQ_THREADS_CPU_PCT; /* * We want more taskqs to reduce lock contention, but we want * less for better request ordering and CPU utilization. */ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); if (zio_taskq_batch_tpq > 0) { count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / zio_taskq_batch_tpq); } else { /* * Prefer 6 threads per taskq, but no more taskqs * than threads in them on large systems. For 80%: * * taskq taskq total * cpus taskqs percent threads threads * ------- ------- ------- ------- ------- * 1 1 80% 1 1 * 2 1 80% 1 1 * 4 1 80% 3 3 * 8 2 40% 3 6 * 16 3 27% 4 12 * 32 5 16% 5 25 * 64 7 11% 7 49 * 128 10 8% 10 100 * 256 14 6% 15 210 */ count = 1 + cpus / 6; while (count * count > cpus) count--; } /* Limit each taskq within 100% to not trigger assertion. */ count = MAX(count, (zio_taskq_batch_pct + 99) / 100); value = (zio_taskq_batch_pct + count / 2) / count; break; case ZTI_MODE_NULL: tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; if (count > 1) (void) snprintf(name, sizeof (name), "%s_%s_%u", zio_type_name[t], zio_taskq_types[q], i); else (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. * * Under Linux and FreeBSD this means incrementing * the priority value as opposed to platforms like * illumos where it should be decremented. * * On FreeBSD, if priorities divided by four (RQ_PPQ) * are equal then a difference between them is * insignificant. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { #if defined(__linux__) pri++; #elif defined(__FreeBSD__) pri += 4; #else #error "unknown OS" #endif } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT3U(tqs->stqs_count, ==, 0); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } taskq_dispatch_ent(tq, func, arg, flags, ent); } /* * Same as spa_taskq_dispatch_ent() but block on the task until completion. */ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; taskqid_t id; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } /* * Disabled until spa_thread() can be adapted for Linux. */ #undef HAVE_SPA_THREAD #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) { psetid_t zio_taskq_psrset_bind = PS_NONE; callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif extern metaslab_ops_t *metaslab_allocator(spa_t *spa); /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { metaslab_ops_t *msp = metaslab_allocator(spa); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_read_spacemaps = spa_mode_readable_spacemaps; spa->spa_normal_class = metaslab_class_create(spa, msp); spa->spa_log_class = metaslab_class_create(spa, msp); spa->spa_embedded_log_class = metaslab_class_create(spa, msp); spa->spa_special_class = metaslab_class_create(spa, msp); spa->spa_dedup_class = metaslab_class_create(spa, msp); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; (void) spa_create_process; #ifdef HAVE_SPA_THREAD /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* HAVE_SPA_THREAD */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_healed, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); spa_activate_os(spa); spa_keystore_init(&spa->spa_keystore); /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy * resolution of various deadlocks. * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. * * A taskq per pool allows one to keep the pools independent. * This way if one pool is suspended, it will not impact another. * * The preferred location to dispatch a zvol minor task is a sync * task. In this context, there is easy access to the spa_t and minimal * error handling is required because the sync task must succeed. */ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); + /* + * The taskq to preload metaslabs. + */ + spa->spa_metaslab_taskq = taskq_create("z_metaslab", + metaslab_preload_pct, maxclsyspri, 1, INT_MAX, + TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); + /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); if (spa->spa_zvol_taskq) { taskq_destroy(spa->spa_zvol_taskq); spa->spa_zvol_taskq = NULL; } + if (spa->spa_metaslab_taskq) { + taskq_destroy(spa->spa_metaslab_taskq); + spa->spa_metaslab_taskq = NULL; + } + if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; } if (spa->spa_upgrade_taskq) { taskq_destroy(spa->spa_upgrade_taskq); spa->spa_upgrade_taskq = NULL; } txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; metaslab_class_destroy(spa->spa_embedded_log_class); spa->spa_embedded_log_class = NULL; metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); avl_destroy(&spa->spa_errlist_healed); spa_keystore_fini(&spa->spa_keystore); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } spa_deactivate_os(spa); } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } static boolean_t spa_should_flush_logs_on_unload(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return (B_FALSE); if (!spa_writeable(spa)) return (B_FALSE); if (!spa->spa_sync_on) return (B_FALSE); if (spa_state(spa) != POOL_STATE_EXPORTED) return (B_FALSE); if (zfs_keep_log_spacemaps_at_export) return (B_FALSE); return (B_TRUE); } /* * Opens a transaction that will set the flag that will instruct * spa_sync to attempt to flush all the metaslabs for that txg. */ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); } static void spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; log_summary_entry_t *e; while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { VERIFY0(e->lse_mscount); kmem_free(e, sizeof (log_summary_entry_t)); } spa->spa_unflushed_stats.sus_nblocks = 0; spa->spa_unflushed_stats.sus_memused = 0; spa->spa_unflushed_stats.sus_blocklimit = 0; } static void spa_destroy_aux_threads(spa_t *spa) { if (spa->spa_condense_zthr != NULL) { zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } if (spa->spa_livelist_delete_zthr != NULL) { zthr_destroy(spa->spa_livelist_delete_zthr); spa->spa_livelist_delete_zthr = NULL; } if (spa->spa_livelist_condense_zthr != NULL) { zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); spa_load_note(spa, "UNLOADING"); spa_wake_waiters(spa); /* * If we have set the spa_final_txg, we have already performed the * tasks below in spa_export_common(). We should not redo it here since * we delay the final TXGs beyond what spa_final_txg is set at. */ if (spa->spa_final_txg == UINT64_MAX) { /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some * time flushing as many metaslabs as we can in an attempt to * destroy log space maps and save import time. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_t *root_vdev = spa->spa_root_vdev; vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } } /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ - if (spa->spa_root_vdev != NULL) { - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { - vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; - if (vc->vdev_mg != NULL) - taskq_wait(vc->vdev_mg->mg_taskq); - } - } + taskq_wait(spa->spa_metaslab_taskq); if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } spa_destroy_aux_threads(spa); spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); brt_unload(spa); spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); if (spa->spa_spares.sav_vdevs) { for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; if (spa->spa_l2cache.sav_vdevs) { for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } if (spa->spa_compatibility != NULL) { spa_strfree(spa->spa_compatibility); spa->spa_compatibility = NULL; } spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ if (spa->spa_spares.sav_vdevs) { for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); } if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, spa->spa_spares.sav_count); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; if (sav->sav_config == NULL) { nl2cache = 0; newvdevs = NULL; goto out; } VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); /* * Upon cache device addition to a pool or pool * creation with a cache device or if the header * of the device is invalid we issue an async * TRIM command for the whole device which will * execute if l2arc_trim_ahead > 0. */ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); if (sav->sav_count > 0) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, sav->sav_count); out: /* * Purge vdevs that were dropped */ if (oldvdevs) { for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); } for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); vmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); nv = fnvlist_alloc(); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). */ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } /* * Passivate any log vdevs (note, does not apply to embedded log metaslabs). */ static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } } return (slog_found); } /* * Activate any log vdevs (note, does not apply to embedded log metaslabs). */ static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_activate(tvd->vdev_mg); } } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { boolean_t sle_verify_data; uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. */ static uint_t spa_load_verify_shift = 4; static int spa_load_verify_metadata = B_TRUE; static int spa_load_verify_data = B_TRUE; static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zio_t *rio = arg; spa_load_error_t *sle = rio->io_private; (void) zilog, (void) dnp; /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); /* * Sanity check the block pointer in order to detect obvious damage * before using the contents in subsequent checks or in zio_read(). * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); if (!BP_IS_METADATA(bp) && (!spa_load_verify_data || !sle->sle_verify_data)) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) dp, (void) arg; if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || policy.zlp_maxmeta == UINT64_MAX) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); /* * Verify data only if we are rewinding or error limit was set. * Otherwise nothing except dbgmsg care about it to waste time. */ sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || (policy.zlp_maxdata < UINT64_MAX); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts); fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } boolean_t spa_livelist_delete_check(spa_t *spa) { return (spa->spa_livelists_to_delete != 0); } static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } static int delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = arg; zio_free(spa, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); return (0); } static int dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) { int err; zap_cursor_t zc; zap_attribute_t za; zap_cursor_init(&zc, os, zap_obj); err = zap_cursor_retrieve(&zc, &za); zap_cursor_fini(&zc); if (err == 0) *llp = za.za_first_integer; return (err); } /* * Components of livelist deletion that must be performed in syncing * context: freeing block pointers and updating the pool-wide data * structures to indicate how much work is left to do */ typedef struct sublist_delete_arg { spa_t *spa; dsl_deadlist_t *ll; uint64_t key; bplist_t *to_free; } sublist_delete_arg_t; static void sublist_delete_sync(void *arg, dmu_tx_t *tx) { sublist_delete_arg_t *sda = arg; spa_t *spa = sda->spa; dsl_deadlist_t *ll = sda->ll; uint64_t key = sda->key; bplist_t *to_free = sda->to_free; bplist_iterate(to_free, delete_blkptr_cb, spa, tx); dsl_deadlist_remove_entry(ll, key, tx); } typedef struct livelist_delete_arg { spa_t *spa; uint64_t ll_obj; uint64_t zap_obj; } livelist_delete_arg_t; static void livelist_delete_sync(void *arg, dmu_tx_t *tx) { livelist_delete_arg_t *lda = arg; spa_t *spa = lda->spa; uint64_t ll_obj = lda->ll_obj; uint64_t zap_obj = lda->zap_obj; objset_t *mos = spa->spa_meta_objset; uint64_t count; /* free the livelist and decrement the feature count */ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); dsl_deadlist_free(mos, ll_obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); VERIFY0(zap_count(mos, zap_obj, &count)); if (count == 0) { /* no more livelists to delete */ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, tx)); VERIFY0(zap_destroy(mos, zap_obj, tx)); spa->spa_livelists_to_delete = 0; spa_notify_waiters(spa); } } /* * Load in the value for the livelist to be removed and open it. Then, * load its first sublist and determine which block pointers should actually * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { spa_t *spa = arg; uint64_t ll_obj = 0, count; objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj = spa->spa_livelists_to_delete; /* * Determine the next livelist to delete. This function should only * be called if there is at least one deleted clone. */ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); VERIFY0(zap_count(mos, ll_obj, &count)); if (count > 0) { dsl_deadlist_t *ll; dsl_deadlist_entry_t *dle; bplist_t to_free; ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); dsl_deadlist_open(ll, mos, ll_obj); dle = dsl_deadlist_first(ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, z, NULL); if (err == 0) { sublist_delete_arg_t sync_arg = { .spa = spa, .ll = ll, .key = dle->dle_mintxg, .to_free = &to_free }; zfs_dbgmsg("deleting sublist (id %llu) from" " livelist %llu, %lld remaining", (u_longlong_t)dle->dle_bpobj.bpo_object, (u_longlong_t)ll_obj, (longlong_t)count - 1); VERIFY0(dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } else { VERIFY3U(err, ==, EINTR); } bplist_clear(&to_free); bplist_destroy(&to_free); dsl_deadlist_close(ll); kmem_free(ll, sizeof (dsl_deadlist_t)); } else { livelist_delete_arg_t sync_arg = { .spa = spa, .ll_obj = ll_obj, .zap_obj = zap_obj }; zfs_dbgmsg("deletion of livelist %llu completed", (u_longlong_t)ll_obj); VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } } static void spa_start_livelist_destroy_thread(spa_t *spa) { ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy", spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, minclsyspri); } typedef struct livelist_new_arg { bplist_t *allocs; bplist_t *frees; } livelist_new_arg_t; static int livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(tx == NULL); livelist_new_arg_t *lna = arg; if (bp_freed) { bplist_append(lna->frees, bp); } else { bplist_append(lna->allocs, bp); zfs_livelist_condense_new_alloc++; } return (0); } typedef struct livelist_condense_arg { spa_t *spa; bplist_t to_keep; uint64_t first_size; uint64_t next_size; } livelist_condense_arg_t; static void spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) { livelist_condense_arg_t *lca = arg; spa_t *spa = lca->spa; bplist_t new_frees; dsl_dataset_t *ds = spa->spa_to_condense.ds; /* Have we been cancelled? */ if (spa->spa_to_condense.cancelled) { zfs_livelist_condense_sync_cancel++; goto out; } dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; /* * It's possible that the livelist was changed while the zthr was * running. Therefore, we need to check for new blkptrs in the two * entries being condensed and continue to track them in the livelist. * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), * it's possible that the newly added blkptrs are FREEs or ALLOCs so * we need to sort them into two different bplists. */ uint64_t first_obj = first->dle_bpobj.bpo_object; uint64_t next_obj = next->dle_bpobj.bpo_object; uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; bplist_create(&new_frees); livelist_new_arg_t new_bps = { .allocs = &lca->to_keep, .frees = &new_frees, }; if (cur_first_size > lca->first_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, livelist_track_new_cb, &new_bps, lca->first_size)); } if (cur_next_size > lca->next_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, livelist_track_new_cb, &new_bps, lca->next_size)); } dsl_deadlist_clear_entry(first, ll, tx); ASSERT(bpobj_is_empty(&first->dle_bpobj)); dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); bplist_destroy(&new_frees); char dsname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, dsname); zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, (u_longlong_t)cur_next_size, (u_longlong_t)first->dle_bpobj.bpo_object, (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); out: dmu_buf_rele(ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); spa->spa_to_condense.syncing = B_FALSE; } static void spa_livelist_condense_cb(void *arg, zthr_t *t) { while (zfs_livelist_condense_zthr_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); spa_t *spa = arg; dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; uint64_t first_size, next_size; livelist_condense_arg_t *lca = kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); bplist_create(&lca->to_keep); /* * Process the livelists (matching FREEs and ALLOCs) in open context * so we have minimal work in syncing context to condense. * * We save bpobj sizes (first_size and next_size) to use later in * syncing context to determine if entries were added to these sublists * while in open context. This is possible because the clone is still * active and open for normal writes and we want to make sure the new, * unprocessed blockpointers are inserted into the livelist normally. * * Note that dsl_process_sub_livelist() both stores the size number of * blockpointers and iterates over them while the bpobj's lock held, so * the sizes returned to us are consistent which what was actually * processed. */ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, &first_size); if (err == 0) err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, t, &next_size); if (err == 0) { while (zfs_livelist_condense_sync_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_mark_netfree(tx); dmu_tx_hold_space(tx, 1); err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); if (err == 0) { /* * Prevent the condense zthr restarting before * the synctask completes. */ spa->spa_to_condense.syncing = B_TRUE; lca->spa = spa; lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), spa_livelist_condense_sync, lca, tx); dmu_tx_commit(tx); return; } } /* * Condensing can not continue: either it was externally stopped or * we were unable to assign to a tx because the pool has run out of * space. In the second case, we'll just end up trying to condense * again in a later txg. */ ASSERT(err != 0); bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; if (err == EINTR) zfs_livelist_condense_zthr_cancel++; } /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. */ static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && (spa->spa_to_condense.cancelled == B_FALSE)) { return (B_TRUE); } return (B_FALSE); } static void spa_start_livelist_condensing_thread(spa_t *spa) { spa->spa_to_condense.ds = NULL; spa->spa_to_condense.first = NULL; spa->spa_to_condense.next = NULL; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa, minclsyspri); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa, minclsyspri); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { const char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); return (error); } #ifdef ZFS_DEBUG /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && vd->vdev_root_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_root_zap)); } if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } #else #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) #endif /* * Determine whether the activity check is required. */ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. */ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_FAIL_INT(ub), (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%llu leaves=%u", (u_longlong_t)import_delay, (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)multihost_interval, (u_longlong_t)import_intervals); } return (import_delay); } /* * Perform the import activity check. If the user canceled the import or * we detected activity then fail. */ static int spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; hrtime_t import_expire; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; cv_init(&cv, NULL, CV_DEFAULT, NULL); mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); /* * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed * during the earlier tryimport. If the txg recorded there is 0 then * the pool is known to be active on another host. * * Otherwise, the pool might be in use on another host. Check for * changes in the uberblocks on disk if necessary. */ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { vdev_uberblock_load(rvd, ub, &mmp_label); error = SET_ERROR(EREMOTEIO); goto out; } } import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * random_in_range(250) / 1000; import_expire = gethrtime() + import_delay; while (gethrtime() < import_expire) { (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { zfs_dbgmsg("multihost activity detected " "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, (u_longlong_t)timestamp, (u_longlong_t)ub->ub_timestamp, (u_longlong_t)mmp_config, (u_longlong_t)ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } if (mmp_label) { nvlist_free(mmp_label); mmp_label = NULL; } error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); if (error != -1) { error = SET_ERROR(EINTR); break; } error = 0; } out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); /* * If the pool is determined to be active store the status in the * spa->spa_load_info nvlist. If the remote hostname or hostid are * available from configuration read from disk store them as well. * This allows 'zpool import' to generate a more useful message. * * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { const char *hostname = ""; uint64_t hostid = 0; if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { hostname = fnvlist_lookup_string(mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { hostid = fnvlist_lookup_uint64(mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, 0); error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); } if (mmp_label) nvlist_free(mmp_label); return (error); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; const char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " "See: https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; const char *comment; const char *compatibility; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. */ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); ASSERT(spa->spa_compatibility == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, &compatibility) == 0) spa->spa_compatibility = spa_strdup(compatibility); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). * This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (spa->spa_load_max_txg != UINT64_MAX) { (void) spa_import_progress_set_max_txg(spa_guid(spa), (u_longlong_t)spa->spa_load_max_txg); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. */ activity_check = spa_activity_check_required(spa, ub, label, spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); spa->spa_label_features = fnvlist_dup(features); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; unsup_feat = fnvlist_alloc(); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { fnvlist_add_string(unsup_feat, nvpair_name(nvp), ""); } } if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); if (error != 0) { nvlist_free(mos_config); spa_config_exit(spa, SCL_ALL, FTAG); spa_load_failed(spa, "spa_config_parse failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); + /* + * If 'zpool import' used a cached config, then the on-disk hostid and + * hostname may be different to the cached config in ways that should + * prevent import. Userspace can't discover this without a scan, but + * we know, so we add these values to LOAD_INFO so the caller can know + * the difference. + * + * Note that we have to do this before the config is regenerated, + * because the new config will have the hostid and hostname for this + * host, in readiness for import. + */ + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, + fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) + fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, + fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); + /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. */ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Encryption was added before bookmark_v2, even though bookmark_v2 * is now a dependency. If this pool has encryption enabled without * bookmark_v2, trigger an errata message. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the livelist deletion field. If a livelist is queued for * deletion, indicate that in the spa */ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, &spa->spa_livelists_to_delete, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { nvlist_free(mos_config); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace = 0; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } error = spa_ld_log_spacemaps(spa); if (error != 0) { spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_load_brt(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = brt_load(spa); if (error != 0) { spa_load_failed(spa, "brt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); spa_import_progress_add(spa); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. */ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ error = spa_ld_load_special_directories(spa); if (error != 0) return (error); /* * Retrieve pool properties from the MOS. */ error = spa_ld_get_props(spa); if (error != 0) return (error); /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); error = spa_ld_load_brt(spa); if (error != 0) return (error); /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check if a rebuild was in progress and if so resume it. * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); } /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open", NULL); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. * * Note: * Since we may be issuing deletes for clones here, * we make sure to do so after we've spawned all the * auxiliary threads above (from which the livelist * deletion zthr is part of). */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); spa_import_progress_remove(spa_guid(spa)); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; spa_import_progress_remove(spa_guid(spa)); return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, const void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; firstopen = B_TRUE; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { *config = fnvlist_dup(spa->spa_config); fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER && config != NULL) { fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } if (firstopen) zvol_create_minors_recursive(spa_name(spa)); *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, const void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); if (nspares != 0) { fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares)); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { guid = fnvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID); VERIFY0(nvlist_lookup_uint64_array(spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } else { vs->vs_state = spa->spa_spares.sav_vdevs[i]->vdev_state; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); if (nl2cache != 0) { fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY0(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); vdev_get_stats(vd, vs); vdev_config_generate_stats(vd, l2cache[i]); } } } static void spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { zap_cursor_t zc; zap_attribute_t za; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } } static void spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) { int i; for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t feature = spa_feature_table[i]; uint64_t refcount; if (feature_get_refcount(spa, &feature, &refcount) != 0) continue; VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); } } /* * Store a list of pool features and their reference counts in the * config. * * The first time this is called on a spa, allocate a new nvlist, fetch * the pool features and reference counts from disk, then save the list * in the spa. In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. */ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; fnvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_approx_errlog_size(spa)); if (spa_suspended(spa)) { fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode); fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatenating with the * current dev list. */ VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs)); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) newdevs[i] = fnvlist_dup(olddevs[i]); for (i = 0; i < ndevs; i++) newdevs[i + oldndevs] = fnvlist_dup(devs[i]); fnvlist_remove(sav->sav_config, config); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)newdevs, ndevs + oldndevs); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ sav->sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)devs, ndevs); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Verify encryption parameters for spa creation. If we are encrypting, we must * have the encryption feature flag enabled. */ static int spa_create_check_encryption_params(dsl_crypto_params_t *dcp, boolean_t has_encryption) { if (dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT && !has_encryption) return (SET_ERROR(ENOTSUP)); return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; const char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; const char *feat_name; const char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; feat_name = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; } } /* verify encryption params, if they were provided */ if (dcp != NULL) { error = spa_create_check_encryption_params(dcp, has_encryption); if (error != 0) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (ENOTSUP); } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP)); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); /* * Create BRT table and BRT table object. */ brt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) spa_history_create_obj(spa, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create", tx); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; spa_import_os(spa); mutex_exit(&spa_namespace_lock); return (0); } /* * Import a non-root pool into the system. */ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; const char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); else spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) fnvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE); else spa->spa_l2cache.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import", NULL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); zvol_create_minors_recursive(pool); spa_import_os(spa); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; const char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } /* * spa_import() relies on a pool config fetched by spa_try_import() * for spare/cache devices. Import flags are not passed to * spa_tryimport(), which makes it return early due to a missing log * device and missing retrieving the cache device and spare eventually. * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch * the correct configuration regardless of the missing log device. */ spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp); fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname; dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { int error; spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_is_exporting) { /* the pool is being exported by another thread */ mutex_exit(&spa_namespace_lock); return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); } spa->spa_is_exporting = B_TRUE; /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ if (spa->spa_sync_on) { txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); } /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { error = SET_ERROR(EBUSY); goto fail; } if (spa->spa_sync_on) { vdev_t *rvd = spa->spa_root_vdev; /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); goto fail; } /* * We're about to export or destroy this pool. Make sure * we stop all initialization and trim activity here before * we set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; vdev_config_dirty(rvd); spa_config_exit(spa, SCL_ALL, FTAG); } /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some * time flushing as many metaslabs as we can in an attempt to * destroy log space maps and save import time. This has to be * done before we set the spa_final_txg, otherwise * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. * spa_should_flush_logs_on_unload() should be called after * spa_state has been set to the new_state. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: spa_export_os(spa); if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) *oldconfig = fnvlist_dup(spa->spa_config); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); } else { /* * If spa_remove() is not called for this spa_t and * there is any possibility that it can be reused, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; } mutex_exit(&spa_namespace_lock); return (0); fail: spa->spa_is_exporting = B_FALSE; spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Destroy a storage pool. */ int spa_destroy(const char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(const char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * This is called as a synctask to increment the draid feature flag */ static void spa_draid_feature_incr(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; int draid = (int)(uintptr_t)arg; for (int c = 0; c < draid; c++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); } /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); } /* * The virtual dRAID spares must be added after vdev tree is created * and the vdev guids are generated. The guid of their associated * dRAID is stored in the config and used when opening the spare. */ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, rvd->vdev_children)) == 0) { if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; } else { return (spa_vdev_exit(spa, vd, txg, error)); } /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz or a dRAID */ if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We can't increment a feature while holding spa_vdev so we * have to do it in a synctask. */ if (ndraid != 0) { dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, (void *)(uintptr_t)ndraid, tx); dmu_tx_commit(tx); } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. * * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); if (dsl_scan_resilvering(spa_get_dsl(spa)) || dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * log, dedup and special vdevs should not be replaced by spares. */ if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } /* * A dRAID spare can only replace a child of its parent dRAID vdev. */ if (newvd->vdev_ops == &vdev_draid_spare_ops && oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (rebuild) { /* * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable * vdevs types are the root vdev, a mirror, or dRAID. */ tvd = pvd; if (pvd->vdev_top != NULL) tvd = pvd->vdev_top; if (tvd->vdev_ops != &vdev_mirror_ops && tvd->vdev_ops != &vdev_root_ops && tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(pvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver or rebuild to restart in the future. We do * this to ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ if (rebuild) { newvd->vdev_rebuild_txg = txg; vdev_rebuild(tvd); } else { newvd->vdev_resilver_txg = txg; if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); } } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing or a spare vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd __maybe_unused = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. * * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a normal spare, then it * implies that the spare should become a real disk, and be removed * from the active spare list for the pool. dRAID spares on the * other hand are coupled to the pool and thus should never be removed * from the spares list. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; if (last_cvd->vdev_isspare && last_cvd->vdev_ops != &vdev_draid_spare_ops) { unspare = B_TRUE; } } /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); spa_notify_waiters(spa); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } static int spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_UNINIT && vd->vdev_initialize_thread != NULL) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } switch (cmd_type) { case POOL_INITIALIZE_START: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; case POOL_INITIALIZE_UNINIT: vdev_uninitialize(vd); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); return (0); } int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all initialize threads to stop. */ vdev_initialize_stop_wait(spa, &vd_list); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } else if (!vd->vdev_has_trim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } else if (secure && !vd->vdev_has_securetrim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } mutex_enter(&vd->vdev_trim_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate a TRIM action we check to see if the * vdev_trim_thread is NULL. We do this instead of using the * vdev_trim_state since there might be a previous TRIM process * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_TRIM_SUSPEND && vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_TRIM_START: vdev_trim(vd, rate, partial, secure); break; case POOL_TRIM_CANCEL: vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); break; case POOL_TRIM_SUSPEND: vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_trim_lock); return (0); } /* * Initiates a manual TRIM for the requested vdevs. This kicks off individual * TRIM threads for each child vdev. These threads pass over all of the free * space in the vdev's metaslabs and issues TRIM commands for that space. */ int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping TRIM. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the TRIM operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, rate, partial, secure, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all TRIM threads to stop. */ vdev_trim_stop_wait(spa, &vd_list); /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; const char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* deal with indirect vdevs */ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == &vdev_indirect_ops) continue; /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c]) || vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ nvl = fnvlist_alloc(); fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); /* * Temporarily stop the initializing and TRIM activity. We set the * state to ACTIVE so that we know to resume initializing or TRIM * once the split has completed. */ list_t vd_initialize_list; list_create(&vd_initialize_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); list_t vd_trim_list; list_create(&vd_trim_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); mutex_exit(&vml[c]->vdev_initialize_lock); mutex_enter(&vml[c]->vdev_trim_lock); vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); mutex_exit(&vml[c]->vdev_trim_lock); } } vdev_initialize_stop_wait(spa, &vd_initialize_list); vdev_trim_stop_wait(spa, &vd_trim_list); list_destroy(&vd_initialize_list); list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { newspa->spa_config_splitting = fnvlist_alloc(); fnvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* * Need to be sure the detachable VDEV is not * on any *other* txg's DTL list to prevent it * from being accessed after it's freed. */ for (int t = 0; t < TXG_SIZE; t++) { (void) txg_list_remove_this( &tvd->vdev_dtl_list, vml[c], t); } vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing or trimming disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); /* * If a detach was not performed above replace waiters will not have * been notified. In which case we must do so now. */ spa_notify_waiters(spa); } /* * Update the stored path or FRU for this vdev. */ static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); if (func == POOL_SCAN_RESILVER && !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } if (func == POOL_SCAN_ERRORSCRUB && !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) return (SET_ERROR(ENOTSUP)); return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); /* Tell userspace that the vdev is gone. */ zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } static __attribute__((noreturn)) void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), (u_longlong_t)new_space, (u_longlong_t)(new_space - old_space)); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE || tasks & SPA_ASYNC_REBUILD_DONE || tasks & SPA_ASYNC_DETACH_SPARE) { spa_vdev_resilver_done(spa); } /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_TRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache rebuilding. */ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_cancel(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_cancel(ll_condense_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_resume(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_resume(ll_condense_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } int spa_async_tasks(spa_t *spa) { return (spa->spa_async_tasks); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); } int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *pio = arg; zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, pio->io_flags)); return (0); } static int bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (spa_free_sync_cb(arg, bp, tx)); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; /* * Note: * If the log space map feature is active, we stop deferring * frees to the next TXG and therefore running this function * would be considered a no-op as spa_deferred_bpobj should * not have any entries. * * That said we run this function anyway (instead of returning * immediately) for the edge-case scenario where we just * activated the log space map feature in this TXG but we have * deferred frees from the previous TXG. */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); memset(packed + nvsize, 0, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); vmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } nvroot = fnvlist_alloc(); if (sav->sav_count == 0) { fnvlist_add_nvlist_array(nvroot, config, (const nvlist_t * const *)NULL, 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); fnvlist_add_nvlist_array(nvroot, config, (const nvlist_t * const *)list, sav->sav_count); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_root_zap != 0 && spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_root_zap, tx)); } if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. */ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", (longlong_t)version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; const char *strval, *fname; zpool_prop_t prop; const char *propname; const char *elemname = nvpair_name(elem); zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(elemname)) { case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. We also need to * update the cache file to keep it in sync with the * MOS version. It's unnecessary to do this for pool * creation since the vdev's configuration has already * been dirtied. */ if (tx->tx_txg != TXG_INITIAL) { vdev_config_dirty(spa->spa_root_vdev); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, "%s=%s", elemname, strval); break; case ZPOOL_PROP_COMPATIBILITY: strval = fnvpair_value_string(elem); if (spa->spa_compatibility != NULL) spa_strfree(spa->spa_compatibility); spa->spa_compatibility = spa_strdup(strval); /* * Dirty the configuration on vdevs as above. */ if (tx->tx_txg != TXG_INITIAL) { vdev_config_dirty(spa->spa_root_vdev); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; case ZPOOL_PROP_INVAL: if (zpool_prop_feature(elemname)) { fname = strchr(elemname, '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", elemname); break; } else if (!zfs_prop_user(elemname)) { ASSERT(zpool_prop_feature(elemname)); break; } zfs_fallthrough; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ if (prop == ZPOOL_PROP_INVAL) { propname = elemname; proptype = PROP_TYPE_STRING; } else { propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); } if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", elemname, strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", elemname, (longlong_t)intval); switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOTRIM: spa->spa_autotrim = intval; spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; default: break; } } else { ASSERT(0); /* not allowed */ } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } uint64_t obsolete_sm_object = 0; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Set the top-level vdev's max queue depth. Evaluate each top-level's * async write queue depth in case it changed. The max queue depth will * not change in the middle of syncing out this txg. */ static void spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; metaslab_class_t *normal = spa_normal_class(spa); metaslab_class_t *special = spa_special_class(spa); metaslab_class_t *dedup = spa_dedup_class(spa); uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || !metaslab_group_initialized(mg)) continue; metaslab_class_t *mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < mg->mg_allocators; i++) { ASSERT0(zfs_refcount_count( &(mg->mg_allocator[i].mga_alloc_queue_depth))); } mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < mg->mg_allocators; i++) { mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. mca_alloc_slots)); ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. mca_alloc_slots)); ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. mca_alloc_slots)); normal->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; special->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; dedup->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; } static void spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } } static void spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) { objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; uint64_t txg = tx->tx_txg; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free || spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { /* * If the log space map feature is active we don't * care about deferred frees and the deferred bpobj * as the log space map should effectively have the * same results (i.e. appending only to one object). */ spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); dsl_errorscrub_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); spa_flush_metaslabs(spa, tx); vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock * (e.g. if we have sync tasks but no dirty user data). We need * to check the uberblock's rootbp because it is updated if we * have synced out dirty data (though in this case the MOS will * most likely also be dirty due to second order effects, we * don't want to rely on that here). */ if (pass == 1 && spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this * TXG is a no-op. Avoid syncing deferred frees, so * that we can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); } /* * Rewrite the vdev configuration (which includes the uberblock) to * commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few random * top-level vdevs that are known to be visible in the config cache * (see spa_vdev_add() for a complete description). If there *are* dirty * vdevs, sync the uberblock to all vdevs. */ static void spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; for (;;) { int error = 0; /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { vdev_t *vd = NULL; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Now that there can be no more cloning in this transaction group, * but we are still before issuing frees, we can process pending BRT * updates. */ brt_pending_apply(spa, txg); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_allocs[i].spaa_lock); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { /* Avoid holding the write lock unless actually necessary */ if (vd->vdev_aux == NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); continue; } /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); dsl_pool_t *dp = spa->spa_dsl_pool; dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { vdev_t *rvd = spa->spa_root_vdev; int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } spa_sync_adjust_vdev_max_queue_depth(spa); spa_sync_condense_indirect(spa, tx); spa_sync_iterate_to_convergence(spa, tx); #ifdef ZFS_DEBUG if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } #endif if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_allocs[i].spaa_lock); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) vdev_autotrim_kick(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } static boolean_t spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) { (void) spa; int i; uint64_t vdev_guid; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &vdev_guid) == 0 && vdev_guid == guid) return (B_TRUE); } return (B_FALSE); } boolean_t spa_has_l2cache(spa_t *spa, uint64_t guid) { return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } uint64_t spa_total_metaslabs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t m = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; m += vd->vdev_ms_count; } return (m); } /* * Notify any waiting threads that some activity has switched from being in- * progress to not-in-progress so that the thread can wake up and determine * whether it is finished waiting. */ void spa_notify_waiters(spa_t *spa) { /* * Acquiring spa_activities_lock here prevents the cv_broadcast from * happening between the waiting thread's check and cv_wait. */ mutex_enter(&spa->spa_activities_lock); cv_broadcast(&spa->spa_activities_cv); mutex_exit(&spa->spa_activities_lock); } /* * Notify any waiting threads that the pool is exporting, and then block until * they are finished using the spa_t. */ void spa_wake_waiters(spa_t *spa) { mutex_enter(&spa->spa_activities_lock); spa->spa_waiters_cancel = B_TRUE; cv_broadcast(&spa->spa_activities_cv); while (spa->spa_waiters != 0) cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); spa->spa_waiters_cancel = B_FALSE; mutex_exit(&spa->spa_activities_lock); } /* Whether the vdev or any of its descendants are being initialized/trimmed. */ static boolean_t spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); ASSERT(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? &vd->vdev_initialize_lock : &vd->vdev_trim_lock; mutex_exit(&spa->spa_activities_lock); mutex_enter(lock); mutex_enter(&spa->spa_activities_lock); boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); mutex_exit(lock); if (in_progress) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], activity)) return (B_TRUE); } return (B_FALSE); } /* * If use_guid is true, this checks whether the vdev specified by guid is * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool * is being initialized/trimmed. The caller must hold the config lock and * spa_activities_lock. */ static int spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, zpool_wait_activity_t activity, boolean_t *in_progress) { mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); vdev_t *vd; if (use_guid) { vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (EINVAL); } } else { vd = spa->spa_root_vdev; } *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (0); } /* * Locking for waiting threads * --------------------------- * * Waiting threads need a way to check whether a given activity is in progress, * and then, if it is, wait for it to complete. Each activity will have some * in-memory representation of the relevant on-disk state which can be used to * determine whether or not the activity is in progress. The in-memory state and * the locking used to protect it will be different for each activity, and may * not be suitable for use with a cvar (e.g., some state is protected by the * config lock). To allow waiting threads to wait without any races, another * lock, spa_activities_lock, is used. * * When the state is checked, both the activity-specific lock (if there is one) * and spa_activities_lock are held. In some cases, the activity-specific lock * is acquired explicitly (e.g. the config lock). In others, the locking is * internal to some check (e.g. bpobj_is_empty). After checking, the waiting * thread releases the activity-specific lock and, if the activity is in * progress, then cv_waits using spa_activities_lock. * * The waiting thread is woken when another thread, one completing some * activity, updates the state of the activity and then calls * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only * needs to hold its activity-specific lock when updating the state, and this * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. * * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, * and because it is held when the waiting thread checks the state of the * activity, it can never be the case that the completing thread both updates * the activity state and cv_broadcasts in between the waiting thread's check * and cv_wait. Thus, a waiting thread can never miss a wakeup. * * In order to prevent deadlock, when the waiting thread does its check, in some * cases it will temporarily drop spa_activities_lock in order to acquire the * activity-specific lock. The order in which spa_activities_lock and the * activity specific lock are acquired in the waiting thread is determined by * the order in which they are acquired in the completing thread; if the * completing thread calls spa_notify_waiters with the activity-specific lock * held, then the waiting thread must also acquire the activity-specific lock * first. */ static int spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); switch (activity) { case ZPOOL_WAIT_CKPT_DISCARD: *in_progress = (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == ENOENT); break; case ZPOOL_WAIT_FREE: *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || spa_livelist_delete_check(spa)); break; case ZPOOL_WAIT_INITIALIZE: case ZPOOL_WAIT_TRIM: error = spa_vdev_activity_in_progress(spa, use_tag, tag, activity, in_progress); break; case ZPOOL_WAIT_REPLACE: mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); break; case ZPOOL_WAIT_REMOVE: *in_progress = (spa->spa_removing_phys.sr_state == DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) break; zfs_fallthrough; case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); scanning = (scn->scn_phys.scn_state == DSS_SCANNING); paused = dsl_scan_is_paused_scrub(scn); *in_progress = (scanning && !paused && is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } default: panic("unrecognized value for activity %d", activity); } return (error); } static int spa_wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { /* * The tag is used to distinguish between instances of an activity. * 'initialize' and 'trim' are the only activities that we use this for. * The other activities can only have a single instance in progress in a * pool at one time, making the tag unnecessary. * * There can be multiple devices being replaced at once, but since they * all finish once resilvering finishes, we don't bother keeping track * of them individually, we just wait for them all to finish. */ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && activity != ZPOOL_WAIT_TRIM) return (EINVAL); if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) return (EINVAL); spa_t *spa; int error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); /* * Increment the spa's waiter count so that we can call spa_close and * still ensure that the spa_t doesn't get freed before this thread is * finished with it when the pool is exported. We want to call spa_close * before we start waiting because otherwise the additional ref would * prevent the pool from being exported or destroyed throughout the * potentially long wait. */ mutex_enter(&spa->spa_activities_lock); spa->spa_waiters++; spa_close(spa, FTAG); *waited = B_FALSE; for (;;) { boolean_t in_progress; error = spa_activity_in_progress(spa, activity, use_tag, tag, &in_progress); if (error || !in_progress || spa->spa_waiters_cancel) break; *waited = B_TRUE; if (cv_wait_sig(&spa->spa_activities_cv, &spa->spa_activities_lock) == 0) { error = EINTR; break; } } spa->spa_waiters--; cv_signal(&spa->spa_waiters_cv); mutex_exit(&spa->spa_activities_lock); return (error); } /* * Wait for a particular instance of the specified activity to complete, where * the instance is identified by 'tag' */ int spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); } /* * Wait for all instances of the specified activity complete */ int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); if (resource) { ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } #else (void) spa, (void) vd, (void) hist_nvl, (void) name; #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL if (ev) { zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } #else (void) ev; #endif } /* * Post a zevent corresponding to the given sysevent. The 'name' must be one * of the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); EXPORT_SYMBOL(spa_get_stats); EXPORT_SYMBOL(spa_create); EXPORT_SYMBOL(spa_import); EXPORT_SYMBOL(spa_tryimport); EXPORT_SYMBOL(spa_destroy); EXPORT_SYMBOL(spa_export); EXPORT_SYMBOL(spa_reset); EXPORT_SYMBOL(spa_async_request); EXPORT_SYMBOL(spa_async_suspend); EXPORT_SYMBOL(spa_async_resume); EXPORT_SYMBOL(spa_inject_addref); EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); /* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); /* spare statech is global across all pools) */ EXPORT_SYMBOL(spa_spare_add); EXPORT_SYMBOL(spa_spare_remove); EXPORT_SYMBOL(spa_spare_exists); EXPORT_SYMBOL(spa_spare_activate); /* L2ARC statech is global across all pools) */ EXPORT_SYMBOL(spa_l2cache_add); EXPORT_SYMBOL(spa_l2cache_remove); EXPORT_SYMBOL(spa_l2cache_exists); EXPORT_SYMBOL(spa_l2cache_activate); EXPORT_SYMBOL(spa_l2cache_drop); /* scanning */ EXPORT_SYMBOL(spa_scan); EXPORT_SYMBOL(spa_scan_stop); /* spa syncing */ EXPORT_SYMBOL(spa_sync); /* only for DMU use */ EXPORT_SYMBOL(spa_sync_allpools); /* properties */ EXPORT_SYMBOL(spa_prop_set); EXPORT_SYMBOL(spa_prop_get); EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, + "Percentage of CPUs to run a metaslab preload taskq"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, "Set to traverse data on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, "Percentage of CPUs to run an IO worker thread"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, "Number of threads per IO worker taskqueue"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 9e9c9c22549d..18c6cbf028b3 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1,4228 +1,4233 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system * calls that change the file system. Each itx has enough information to * be able to replay them after a system crash, power loss, or * equivalent failure mode. These are stored in memory until either: * * 1. they are committed to the pool by the DMU transaction group * (txg), at which point they can be discarded; or * 2. they are committed to the on-disk ZIL for the dataset being * modified (e.g. due to an fsync, O_DSYNC, or other synchronous * requirement). * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first * instantiated (e.g. if the dataset is a normal filesystem, when it is * first mounted). * * As hinted at above, there is one ZIL per dataset (both the in-memory * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * * - a single, per-dataset, ZIL header; which points to a chain of * - zero or more ZIL blocks; each of which contains * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and * the blocks are chained together, similarly to a singly linked list. * * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL * block in the chain, and the ZIL header points to the first block in * the chain. * * Note, there is not a fixed place in the pool to hold these ZIL * blocks; they are dynamically allocated and freed as needed from the * blocks available on the pool, though they can be preferentially * allocated from a dedicated "log" vdev. */ /* * This controls the amount of time that a ZIL block (lwb) will remain * "open" when it isn't "full", and it has a thread waiting for it to be * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ static uint_t zfs_commit_timeout_pct = 5; /* * Minimal time we care to delay commit waiting for more ZIL records. * At least FreeBSD kernel can't sleep for less than 2us at its best. * So requests to sleep for less then 5us is a waste of CPU time with * a risk of significant log latency increase due to oversleep. */ static uint64_t zil_min_commit_timeout = 5000; /* * See zil.h for more information about these fields. */ static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; /* * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to * the disk(s) by the ZIL after an LWB write has completed. Setting this * will cause ZIL corruption on power loss if a volatile out-of-order * write cache is enabled. */ static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0], sizeof (zc->zc_word[ZIL_ZC_GUID_0])); (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1], sizeof (zc->zc_word[ZIL_ZC_GUID_1])); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } static int zil_kstats_global_update(kstat_t *ksp, int rw) { zil_kstat_values_t *zs = ksp->ks_data; ASSERT3P(&zil_stats, ==, zs); if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } zil_kstat_values_update(zs, &zil_sums_global); return (0); } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; if (!decrypt) zio_flags |= ZIO_FLAG_RAW; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; /* * If we are not using the resulting data, we are just checking that * it hasn't been corrupted so we don't need to waste CPU time * decompressing and decrypting it. */ if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } void zil_sums_init(zil_sums_t *zs) { wmsum_init(&zs->zil_commit_count, 0); wmsum_init(&zs->zil_commit_writer_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); wmsum_init(&zs->zil_itx_copied_count, 0); wmsum_init(&zs->zil_itx_copied_bytes, 0); wmsum_init(&zs->zil_itx_needcopy_count, 0); wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void zil_sums_fini(zil_sums_t *zs) { wmsum_fini(&zs->zil_commit_count); wmsum_fini(&zs->zil_commit_writer_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); wmsum_fini(&zs->zil_itx_copied_count); wmsum_fini(&zs->zil_itx_copied_bytes); wmsum_fini(&zs->zil_itx_needcopy_count); wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_write); wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); wmsum_fini(&zs->zil_itx_metaslab_slog_write); wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) { zs->zil_commit_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_count); zs->zil_commit_writer_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_writer_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_count); zs->zil_itx_indirect_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_bytes); zs->zil_itx_copied_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_count); zs->zil_itx_copied_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_bytes); zs->zil_itx_needcopy_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_count); zs->zil_itx_needcopy_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_bytes); zs->zil_itx_metaslab_normal_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); zs->zil_itx_metaslab_normal_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); zs->zil_itx_metaslab_normal_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); zs->zil_itx_metaslab_slog_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); zs->zil_itx_metaslab_slog_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, boolean_t decrypt) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *lrp, *end; arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; error = parse_blk_func(zilog, &blk, arg, txg); if (error != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, &lrp, &end, &abuf); if (error != 0) { if (abuf) arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS read log block error %d, " "dataset %s, seq 0x%llx\n", error, name, (u_longlong_t)blk_seq); } break; } for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; } error = parse_lr_func(zilog, lr, arg, txg); if (error != 0) { arc_buf_destroy(abuf, &abuf); goto done; } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); return (error); } static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* * As we call this function from the context of a rewind to a * checkpoint, each ZIL block whose txg is later than the txg * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ if (bp->blk_birth >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) return (0); zio_free(zilog->zl_spa, first_txg, bp); return (0); } static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } static int zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ if (lr->lr_blkptr.blk_birth >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); } return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } static int zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } /* * XXX: Do we need to byteswap lr? */ spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } return (0); } static int zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: return (zil_claim_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { (void) claim_txg; zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If we previously claimed it, we need to free it. */ if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } return (0); } static int zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } return (0); } static int zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { if (claim_txg == 0) { return (0); } switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_free_write(zilog, lrc, tx, claim_txg)); case TX_CLONE_RANGE: return (zil_free_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; return (TREE_CMP(v1, v2)); } /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case * we choose them here and later make the block allocation match. */ static lwb_t * zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); sz = BP_GET_LSIZE(bp); } else { BP_ZERO(&lwb->lwb_blk); lwb->lwb_slim = (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL); } lwb->lwb_slog = slog; lwb->lwb_error = 0; if (lwb->lwb_slim) { lwb->lwb_nmax = sz; lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); } else { lwb->lwb_nmax = sz - sizeof (zil_chain_t); lwb->lwb_nused = lwb->lwb_nfilled = 0; } lwb->lwb_sz = sz; lwb->lwb_state = state; lwb->lwb_buf = zio_buf_alloc(sz); lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; lwb->lwb_alloc_txg = txg; lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); if (state != LWB_STATE_NEW) zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); } static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer * valid, and prevent use-after-free errors. */ if (zilog->zl_last_lwb_opened == lwb) zilog->zl_last_lwb_opened = NULL; kmem_cache_free(zil_lwb_cache, lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ static void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(spa_writeable(zilog->zl_spa)); if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); } } /* * Determine if the zil is dirty in the specified txg. Callers wanting to * ensure that the dirty state does not change must hold the itxg_lock for * the specified txg. Holding the lock will ensure that the zil cannot be * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. */ static boolean_t __maybe_unused zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) return (B_TRUE); return (B_FALSE); } /* * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ static boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Its called in zil_commit context (zil_process_commit_list()/zil_create()). * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every * zil_commit. */ static void zil_commit_activate_saxattr_feature(zilog_t *zilog) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); uint64_t txg = 0; dmu_tx_t *tx = NULL; if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); txg = dmu_tx_get_txg(tx); mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianness */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write block (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block pointer into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { /* * If "zilsaxattr" feature is enabled on zpool, then activate * it now when we're creating the ZIL chain. We can't wait with * this until we write the first xattr log record because we * need to wait for the feature activation to sync out. */ if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL) { mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } else { /* * This branch covers the case where we enable the feature on a * zpool that has existing ZIL headers. */ zil_commit_activate_saxattr_feature(zilog); } IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); } /* * In one tx, free all log blocks and clear the log header. If keep_first * is set, then we're replaying a log with no content. We want to keep the * first block, however, so that the first synchronous transaction doesn't * require a txg_wait_synced() in zil_create(). We don't need to * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); return (B_TRUE); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); } int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; zilog_t *zilog; uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own_obj(dp, ds->ds_object, DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %llu, error %u", (unsigned long long)ds->ds_object, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); first_txg = spa_min_claim_txg(zilog->zl_spa); /* * If the spa_log_state is not set to be cleared, check whether * the current uberblock is a checkpoint one and if the current * header has been claimed before moving on. * * If the current uberblock is a checkpointed uberblock then * one of the following scenarios took place: * * 1] We are currently rewinding to the checkpoint of the pool. * 2] We crashed in the middle of a checkpoint rewind but we * did manage to write the checkpointed uberblock to the * vdev labels, so when we tried to import the pool again * the checkpointed uberblock was selected from the import * procedure. * * In both cases we want to zero out all the ZIL blocks, except * the ones that have been claimed at the time of the checkpoint * (their zh_claim_txg != 0). The reason is that these blocks * may be corrupted since we may have reused their locations on * disk after we took the checkpoint. * * We could try to set spa_log_state to SPA_LOG_CLEAR earlier * when we first figure out whether the current uberblock is * checkpointed or not. Unfortunately, that would discard all * the logs, including the ones that are claimed, and we would * leak space. */ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, zil_noop_log_record, tx, first_txg, B_FALSE); } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * If we are not rewinding and opening the pool normally, then * the min_claim_txg should be equal to the first txg of the pool. */ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg, B_FALSE); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_from_ds(ds, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset %llu, error %d", (unsigned long long)ds->ds_object, error); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the * log as its content should have already been synced to the * pool. */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) return (0); /* * Check whether the current uberblock is checkpointed (e.g. * we are rewinding) and whether the current header has been * claimed or not. If it hasn't then skip verifying it. We * do this because its ZIL blocks may be part of the pool's * state before the rewind, which is no longer valid. */ zil_header_t *zh = zil_header_in_syncing_context(zilog); if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return (0); } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. * In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done, and signal any thread(s) waiting on it. An itx can * be skipped (and not committed to an lwb) for a variety of reasons, * one of them being that the itx was committed via spa_sync(), prior to * it being committed to an lwb; this can happen if a thread calling * zil_commit() is racing with spa_sync(). */ static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } /* * This function is used when the given waiter is to be linked into an * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. * At this point, the waiter will no longer be referenced by the itx, * and instead, will be referenced by the lwb. */ static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's * zl_issuer_lock while the lwb is open and zl_lock otherwise. * zl_issuer_lock also protects leaving the open state. * zcw_lwb setting is protected by zl_issuer_lock and state != * flush_done, which transition is protected by zl_lock. */ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); IMPLY(lwb->lwb_state != LWB_STATE_OPENED, MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; } /* * This function is used when zio_alloc_zil() fails to allocate a ZIL * block, and the given waiter must be linked to the "nolwb waiters" * list inside of zil_process_commit_list(). */ static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); } void zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) { avl_tree_t *t = &lwb->lwb_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); if (zil_nocacheflush) return; mutex_enter(&lwb->lwb_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&lwb->lwb_vdev_lock); } static void zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) { avl_tree_t *src = &lwb->lwb_vdev_tree; avl_tree_t *dst = &nlwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does * not need the protection of lwb_vdev_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ mutex_enter(&nlwb->lwb_vdev_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. */ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { avl_index_t where; if (avl_find(dst, zv, &where) == NULL) { avl_insert(dst, zv, where); } else { kmem_free(zv, sizeof (*zv)); } } mutex_exit(&nlwb->lwb_vdev_lock); } void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); } /* * This function is a called after all vdevs associated with a given lwb * write have completed their DKIOCFLUSHWRITECACHE command; or as soon * as the lwb write completes, if "zil_nocacheflush" is set. Further, * all "previous" lwb's will have completed before this function is * called; i.e. this function is called for all previous lwbs before * it's called for "this" lwb (enforced via zio the dependencies * configured in zil_lwb_set_zio_dependency()). * * The intention is for this function to be called as soon as the * contents of an lwb are considered "stable" on disk, and will survive * any sudden loss of power. At this point, any threads waiting for the * lwb to reach this state are signalled, and the "waiter" structures * are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number * for ztest. We only update this value when all the log * writes succeeded, because ztest wants to ASSERT that * it got the whole log chain. */ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in * order for this error handling to work correctly. This * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). * * With that said, it's important to note that LWB flush * errors are not propagated up to the LWB root ZIO. * This is incorrect behavior, and results in VDEV flush * errors not being handled correctly here. See the * comment above the call to "zio_flush" for details. */ zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } uint64_t txg = lwb->lwb_issued_txg; /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) cv_broadcast(&zilog->zl_lwb_io_cv); mutex_exit(&zilog->zl_lwb_io_lock); } /* * Wait for the completion of all issued write/flush of that txg provided. * It guarantees zil_lwb_flush_vdevs_done() is called and returned. */ static void zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) { ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); mutex_enter(&zilog->zl_lwb_io_lock); while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lwb_io_lock); #ifdef ZFS_DEBUG mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); IMPLY(lwb->lwb_issued_txg > 0, lwb->lwb_state == LWB_STATE_FLUSH_DONE); } IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE, lwb->lwb_buf == NULL); lwb = list_next(&zilog->zl_lwb_list, lwb); } mutex_exit(&zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lock); #endif } /* * This is called when an lwb's write zio completes. The callback's * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved * in writing out this specific lwb's data, and in the case that cache * flushes have been deferred, vdevs involved in writing the data for * previous lwbs. The writes corresponding to all the vdevs in the * lwb_vdev_tree will have completed by the time this is called, due to * the zio dependencies configured in zil_lwb_set_zio_dependency(), * which takes deferred flushes into account. The lwb will be "done" * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio * completion callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; /* * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not * called for it yet, and when it will be, it won't be able to make * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ nlwb = list_next(&zilog->zl_lwb_list, lwb); if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) return; /* * If there was an IO error, we're not going to call zio_flush() * on these vdevs, so we simply empty the tree and free the * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * * Additionally, we don't perform any further error handling at * this point (e.g. setting "zcw_zio_error" appropriately), as * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, * we expect any error seen here, to have been propagated to * that function). */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } /* * If this lwb does not have any threads waiting for it to * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE * command to the vdevs written to by "this" lwb, and instead * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE * command for those vdevs. Thus, we merge the vdev tree of * "this" lwb with the vdev tree of the "next" lwb in the list, * and assume the "next" lwb will handle flushing the vdevs (or * deferring the flush(s) again). * * This is a useful performance optimization, especially for * workloads with lots of async write activity and few sync * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; } while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL && !vd->vdev_nowritecache) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, * any errors when flushing the vdev(s), will * (unfortunately) not be handled correctly, * since these "zio_flush" errors will not be * propagated up to "zil_lwb_flush_vdevs_done". */ zio_flush(lwb->lwb_root_zio, vd); } kmem_free(zv, sizeof (*zv)); } } /* * Build the zio dependency chain, which is used to preserve the ordering of * lwb completions that is required by the semantics of the ZIL. Each new lwb * zio becomes a parent of the previous lwb zio, such that the new lwb's zio * cannot complete until the previous lwb's zio completes. * * This is required by the semantics of zil_commit(): the commit waiters * attached to the lwbs will be woken in the lwb zio's completion callback, * so this zio dependency graph ensures the waiters are woken in the correct * order (the same order the lwbs were created). */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); if (prev_lwb == NULL || prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) return; /* * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous * lwb will rely on this lwb to flush the vdevs written to by that * previous lwb. Thus, we need to ensure this lwb doesn't issue the * flush until after the previous lwb's write completes. We ensure * this ordering by setting the zio parent/child relationship here. * * Without this relationship on the lwb's write zio, it's possible * for this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, * zil_lwb_write_done()). */ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); } else { ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to * accept new itxs being committed to it. This function is idempotent; if * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (lwb->lwb_state != LWB_STATE_NEW) { ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ static const struct { uint64_t limit; uint64_t blksz; } zil_block_buckets[] = { { 4096, 4096 }, /* non TX_WRITE */ { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ { 131072, 131072 }, /* < 128KB writes */ { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see * zl_max_block_size instead. */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); lwb->lwb_state = LWB_STATE_CLOSED; /* * If there was an allocation failure then returned NULL will trigger * zil_commit_writer_stall() at the caller. This is inherently racy, * since allocation may not have happened yet. */ if (lwb->lwb_error != 0) return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guessing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, zil_blksz, uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. */ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; boolean_t slog; zbookmark_phys_t zb; zio_priority_t prio; int error; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* Actually fill the lwb with the data. */ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); /* * The lwb is now ready to be issued, but it can be only if it already * got its block pointer allocated or the allocation has failed. * Otherwise leave it as-is, relying on some other thread to issue it * after allocating its block pointer via calling zil_lwb_write_issue() * for the previous lwb(s) in the chain. */ mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); return; } mutex_exit(&zilog->zl_lock); next_lwb: if (lwb->lwb_slim) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. */ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; } else { /* * We can't write the lwb if there was an allocation failure, * so create a null zio instead just to maintain dependencies. */ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); lwb->lwb_write_zio->io_error = lwb->lwb_error; } if (lwb->lwb_child_zio) zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* * Open transaction to allocate the next block pointer. */ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); uint64_t txg = dmu_tx_get_txg(tx); /* * Allocate next the block pointer unless we are already in error. */ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, &slog); } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } /* * Reduce TXG open time by incrementing inflight counter and committing * the transaciton. zil_sync() will wait for it to return to zero. */ mutex_enter(&zilog->zl_lwb_io_lock); lwb->lwb_issued_txg = txg; zilog->zl_lwb_inflight[txg & TXG_MASK]++; zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); mutex_exit(&zilog->zl_lwb_io_lock); dmu_tx_commit(tx); spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* * We've completed all potentially blocking operations. Update the * nlwb and allow it proceed without possible lock order reversals. */ mutex_enter(&zilog->zl_lock); zil_lwb_set_zio_dependency(zilog, lwb); lwb->lwb_state = LWB_STATE_ISSUED; if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; nlwb->lwb_slog = slog; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } else { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } lwb->lwb_issued_timestamp = gethrtime(); if (lwb->lwb_child_zio) zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); zio_nowait(lwb->lwb_root_zio); /* * If nlwb was ready when we gave it the block pointer, * it is on us to issue it and possibly following ones. */ lwb = nlwb; if (lwb) goto next_lwb; } /* * Maximum amount of data that can be put into single log block. */ uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize) { return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). */ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { - return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8); + return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a - * single log block. For space efficiency, we want to fit two records into a - * max-sized log block. + * single log block. Below that it is a tradeoff of additional memory copy + * and possibly worse log space efficiency vs additional range lock/unlock. */ +static uint_t zil_maxcopied = 7680; + uint64_t zil_max_copied_data(zilog_t *zilog) { - return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - - sizeof (lr_write_t)); + uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); + return (MIN(max_data, zil_maxcopied)); } /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. * Has to be called under zl_issuer_lock to call zil_lwb_write_close() * to chain more lwbs. */ static lwb_t * zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { itx_t *citx; lr_t *lr, *clr; lr_write_t *lrw; uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); lr = &itx->itx_lr; lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead * it's simply used as a place holder on the commit list, and * provides a mechanism for attaching a "commit waiter" onto the * correct lwb (such that the waiter can be signalled upon * completion of that lwb). Thus, we don't process this itx's * log record if it's a commit itx (these itx's don't have log * records), and instead link the itx's waiter onto the lwb's * list of waiters. * * For more details, see the comment above zil_commit(). */ if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to * hold reclen. For WR_COPIED, we need to fit the whole * record in one block, and reclen is the header size + the * data size. For WR_NEED_COPY, we can create multiple * records, splitting the data into multiple blocks, so we * only need to fit one word of data per block; in this case * reclen is just the header size (no data). */ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); citx = zil_itx_clone(itx); clr = &citx->itx_lr; lr_write_t *clrw = (lr_write_t *)clr; clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; } else { citx = itx; clr = lr; } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; if (dlen > 0) { zilog->zl_cur_used += reclen; goto cont; } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. */ static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; lr_write_t *lrw, *lrwb; char *lr_buf; uint64_t dlen, reclen; lr = &itx->itx_lr; lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) return; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; memcpy(lr_buf, lr, reclen); lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { lwb->lwb_child_zio = zio_root( zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } } /* * The "lwb_child_zio" we pass in will become a child of * "lwb_write_zio", when one is created, so one will be * a parent of any zio's created by the "zl_get_data". * This way "lwb_write_zio" will first wait for children * block pointers before own writing, and then for their * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ memset((char *)dbuf + lrwb->lr_length, 0, dlen - lrwb->lr_length); } /* * Typically, the only return values we should see from * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or * EALREADY. However, it is also possible to see other * error values such as ENOSPC or EINVAL from * dmu_read() -> dnode_hold() -> dnode_hold_impl() or * ENXIO as well as a multitude of others from the * block layer through dmu_buf_hold() -> dbuf_read() * -> zio_wait(), as well as through dmu_read() -> * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> * zio_wait(). When these errors happen, we can assume * that neither an immediate write nor an indirect * write occurred, so we need to fall back to * txg_wait_synced(). This is unusual, so we print to * dmesg whenever one of these errors occurs. */ switch (error) { case 0: break; default: cmn_err(CE_WARN, "zil_lwb_commit() received " "unexpected error %d from ->zl_get_data()" ". Falling back to txg_wait_synced().", error); zfs_fallthrough; case EIO: txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: return; } } } lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * zil_itx_create(uint64_t txtype, size_t olrsize) { size_t itxsize, lrsize; itx_t *itx; lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; itx->itx_size = itxsize; return (itx); } static itx_t * zil_itx_clone(itx_t *oitx) { itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; itx->itx_callback_data = NULL; return (itx); } void zil_itx_destroy(itx_t *itx) { IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) itx->itx_callback(itx->itx_callback_data); zio_data_buf_free(itx, itx->itx_size); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * * - a thread calls zil_commit() which assigns the * commit itx to a per-txg i_sync_list * - zil_itxg_clean() is called (e.g. via spa_sync()) * while the waiter is still on the i_sync_list * * There's nothing to prevent syncing the txg while the * waiter is on the i_sync_list. This normally doesn't * happen because spa_sync() is slower than zil_commit(), * but if zil_commit() calls txg_wait_synced() (e.g. * because zil_create() or zil_commit_writer_stall() is * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); zil_itx_destroy(itx); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; return (TREE_CMP(o1, o2)); } /* * Remove all async itx with the given oid. */ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", (u_longlong_t)itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); /* * We don't want to dirty the ZIL using ZILTEST_TXG, because * zil_clean() will never be called using ZILTEST_TXG. Thus, we * need to be careful to always dirty the ZIL using the "real" * TXG (not itxg_txg) even when the SPA is frozen. */ zilog_dirty(zilog, dmu_tx_get_txg(tx)); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT3U(itxg->itxg_txg, !=, 0); clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } /* * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ static uint64_t zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. That's okay since we'll * only commit things in the future. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If we're adding itx records to the zl_itx_commit_list, * then the zil better be dirty in this "txg". We can assert * that here since we're holding the itxg_lock which will * prevent spa_sync from cleaning it. Once we add the itxs * to the zl_itx_commit_list we must commit it to disk even * if it's unnecessary (i.e. the txg was synced). */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. * Allow all earlier itxs to be committed, but ask * caller to do txg_wait_synced(txg) for any new. */ if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); } return (wtxg); } /* * Move the async itxs for a specified object to commit into sync lists. */ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { ian = avl_find(t, &foid, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } /* * This function will prune commit itxs that are at the head of the * commit list (it won't prune past the first non-commit itx), and * either: a) attach them to the last lwb that's still pending * completion, or b) skip them altogether. * * This is used as a performance optimization to prevent commit itxs * from generating new lwbs when it's unnecessary to do so. */ static void zil_prune_commit_list(zilog_t *zilog) { itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; if (lrc->lrc_txtype != TX_COMMIT) break; mutex_enter(&zilog->zl_lock); lwb_t *last_lwb = zilog->zl_last_lwb_opened; if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* * All of the itxs this waiter was waiting on * must have already completed (or there were * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } static void zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on * disk, we must call txg_wait_synced() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), * and start a new ZIL chain. * * Since zil_alloc_zil() failed, the lwb that was previously * issued does not have a pointer to the "next" lwb on disk. * Thus, if another ZIL writer thread was to allocate the "next" * on-disk lwb, that block could be leaked in the event of a * crash (because the previous lwb on-disk would not point to * it). * * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT(list_is_empty(&zilog->zl_lwb_list)); } /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly * created lwbs. Additionally, as a new lwb is created, the previous * lwb will be issued to the zio layer to be written to disk. */ static void zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); } else { /* * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); first = (lwb->lwb_state == LWB_STATE_NEW) && ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || plwb->lwb_state == LWB_STATE_FLUSH_DONE); } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; ASSERT3U(txg, !=, 0); if (lrc->lrc_txtype == TX_COMMIT) { DTRACE_PROBE2(zil__process__commit__itx, zilog_t *, zilog, itx_t *, itx); } else { DTRACE_PROBE2(zil__process__normal__itx, zilog_t *, zilog, itx_t *, itx); } boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); /* * If the txg of this itx has already been synced out, then * we don't need to commit this itx to an lwb. This is * because the data of this itx will have already been * written to the main pool. This is inherently racy, and * it's still ok to commit an itx whose txg has already * been synced; this will result in a write that's * unnecessary, but will do no harm. * * With that said, we always want to commit TX_COMMIT itxs * to an lwb, regardless of whether or not that itx's txg * has been synced out. We do this to ensure any OPENED lwb * will always have at least one zil_commit_waiter_t linked * to the lwb. * * As a counter-example, if we skipped TX_COMMIT itx's * whose txg had already been synced, the following * situation could occur if we happened to be racing with * spa_sync: * * 1. We commit a non-TX_COMMIT itx to an lwb, where the * itx's txg is 10 and the last synced txg is 9. * 2. spa_sync finishes syncing out txg 10. * 3. We move to the next itx in the list, it's a TX_COMMIT * whose txg is 10, so we skip it rather than committing * it to the lwb used in (1). * * If the itx that is skipped in (3) is the last TX_COMMIT * itx in the commit list, than it's possible for the lwb * used in (1) to remain in the OPENED state indefinitely. * * To prevent the above scenario from occurring, ensuring * that once an lwb is OPENED it will transition to ISSUED * and eventually DONE, we always commit TX_COMMIT itx's to * an lwb here, even if that itx's txg has already been * synced. * * Finally, if the pool is frozen, we _always_ commit the * itx. The point of freezing the pool is to prevent data * from being written to the main pool via spa_sync, and * instead rely solely on the ZIL to persistently store the * data; i.e. when the pool is frozen, the last synced txg * value can't be trusted. */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); } else if ((zcw->zcw_lwb != NULL && zcw->zcw_lwb != lwb) || zcw->zcw_done) { /* * Our lwb is done, leave the rest of * itx list to somebody else who care. */ first = B_FALSE; break; } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } list_insert_tail(&nolwb_itxs, itx); } } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } } if (lwb == NULL) { /* * This indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* * Additionally, we have to signal and mark the "nolwb" * waiters as "done" here, since without an lwb, we * can't do this via zil_lwb_flush_vdevs_done() like * normal. */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" * variable is in "new" or "opened" state. * * If it's "new", then no itxs have been committed to it, so * there's no point in issuing its zio (i.e. it's "empty"). * * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * * 1. Ideally, there will be more ZIL activity occurring on * the system, such that this function will be immediately * called again by different thread and this lwb will be * closed by zil_lwb_assign(). This way, the lwb will be * "full" when it is issued to disk, and we'll make use of * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, zil_commit_waiter() will close it and issue * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. * * We do this for a couple of reasons: * * 1. To try and reduce the number of IOPs needed to * write the same number of itxs. If an lwb has space * available in its buffer for more itxs, and more itxs * will be committed relatively soon (relative to the * latency of performing a write), then it's beneficial * to wait for these "next" itxs. This way, more itxs * can be committed to stable storage with fewer writes. * * 2. To try and use the largest lwb block size that the * incoming rate of itxs can support. Again, this is to * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. * * If we had no already running or open LWBs, it can be * the workload is single-threaded. And if the ZIL write * latency is very small or if the LWB is almost full, it * may be cheaper to bypass the delay. */ if (lwb->lwb_state == LWB_STATE_OPENED && first) { hrtime_t sleep = zilog->zl_last_lwb_latency * zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_nmax - lwb->lwb_nused < lwb->lwb_nmax / 8) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); } } } } } /* * This function is responsible for ensuring the passed in commit waiter * (and associated commit itx) is committed to an lwb. If the waiter is * not already committed to an lwb, all itxs in the zilog's queue of * itxs will be processed. The assumption is the passed in waiter's * commit itx will found in the queue just like the other non-commit * itxs, such that when the entire queue is processed, the waiter will * have been committed to an lwb. * * The lwb associated with the passed in waiter is not guaranteed to * have been issued by the time this function completes. If the lwb is * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). */ static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { /* * It's possible that, while we were waiting to acquire * the "zl_issuer_lock", another thread committed this * waiter to an lwb. If that occurs, we bail out early, * without processing any of the zilog's queue of itxs. * * On certain workloads and system configurations, the * "zl_issuer_lock" can become highly contended. In an * attempt to reduce this contention, we immediately drop * the lock if the waiter has already been processed. * * We've measured this optimization to reduce CPU spent * contending on this lock by up to 5%, using a system * with 32 CPUs, low latency storage (~50 usec writes), * and 1024 threads performing sync writes. */ goto out; } ZIL_STAT_BUMP(zilog, zil_commit_writer_count); wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); return (wtxg); } static void zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT3B(zcw->zcw_done, ==, B_FALSE); lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can * immediately return since there's no work to be done (the * point of this function is to issue the lwb). Additionally, we * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ if (lwb->lwb_state != LWB_STATE_OPENED) return; /* * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order * elsewhere. */ mutex_exit(&zcw->zcw_lock); mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zcw->zcw_lock); /* * Since we just dropped and re-acquired the commit waiter's * lock, we have to re-check to see if the waiter was marked * "done" during that process. If the waiter was marked "done", * the "lwb" pointer is no longer valid (it can be free'd after * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ if (zcw->zcw_done) { mutex_exit(&zilog->zl_issuer_lock); return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); /* * We've already checked this above, but since we hadn't acquired * the zilog's zl_issuer_lock, we have to perform this check a * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on * if it's OPENED or CLOSED, and block any other threads that might * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state != LWB_STATE_OPENED) { mutex_exit(&zilog->zl_issuer_lock); return; } /* * We do not need zcw_lock once we hold zl_issuer_lock and know lwb * is still open. But we have to drop it to avoid a deadlock in case * callback of zio issued by zil_lwb_write_issue() try to get it, * while zil_lwb_write_issue() is blocked on attempt to issue next * lwb it found in LWB_STATE_READY state. */ mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and * zil_process_commit_list(), we need to issue this lwb's zio * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* * Since the lwb's zio hadn't been issued by the time this thread * reached its timeout, we reset the zilog's "zl_cur_used" field * to influence the zil block size selection algorithm. * * By having to issue the lwb's zio here, it means the size of the * lwb was too large, given the incoming throughput of itxs. By * setting "zl_cur_used" to zero, we communicate this fact to the * block size selection algorithm, so it can take this information * into account, and potentially select a smaller size for the * next lwb block that is allocated. */ zilog->zl_cur_used = 0; if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. */ zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); mutex_exit(&zilog->zl_issuer_lock); } else { mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); } mutex_enter(&zcw->zcw_lock); } /* * This function is responsible for performing the following two tasks: * * 1. its primary responsibility is to block until the given "commit * waiter" is considered "done". * * 2. its secondary responsibility is to issue the zio for the lwb that * the given "commit waiter" is waiting on, if this function has * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. * * For more details, see zil_process_commit_list(); more specifically, * the comment at the bottom of that function. */ static void zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zcw->zcw_lock); /* * The timeout is scaled based on the lwb latency to avoid * significantly impacting the latency of each individual itx. * For more details, see the comment at the bottom of the * zil_process_commit_list() function. */ int pct = MAX(zfs_commit_timeout_pct, 1); hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; hrtime_t wakeup = gethrtime() + sleep; boolean_t timedout = B_FALSE; while (!zcw->zcw_done) { ASSERT(MUTEX_HELD(&zcw->zcw_lock)); lwb_t *lwb = zcw->zcw_lwb; /* * Usually, the waiter will have a non-NULL lwb field here, * but it's possible for it to be NULL as a result of * zil_commit() racing with spa_sync(). * * When zil_clean() is called, it's possible for the itxg * list (which may be cleaned via a taskq) to contain * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be * marked done until zil_commit_waiter_skip() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the * "zcw" variable) to be found in this "in between" state; * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); /* * If the lwb hasn't been issued yet, then we * need to wait with a timeout, in case this * function needs to issue the lwb after the * timeout is reached; responsibility (2) from * the comment above this function. */ int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE; zil_commit_waiter_timeout(zilog, zcw); if (!zcw->zcw_done) { /* * If the commit waiter has already been * marked "done", it's possible for the * waiter's lwb structure to have already * been freed. Thus, we can only reliably * make these assertions if the waiter * isn't done. */ ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); } } else { /* * If the lwb isn't open, then it must have already * been issued. In that case, there's no need to * use a timeout when waiting for the lwb to * complete. * * Additionally, if the lwb is NULL, the waiter * will soon be signaled and marked done via * zil_clean() and zil_itxg_clean(), so no timeout * is required. */ IMPLY(lwb != NULL, lwb->lwb_state == LWB_STATE_CLOSED || lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); } } mutex_exit(&zcw->zcw_lock); } static zil_commit_waiter_t * zil_alloc_commit_waiter(void) { zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; zcw->zcw_zio_error = 0; return (zcw); } static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); kmem_cache_free(zil_zcw_cache, zcw); } /* * This function is used to create a TX_COMMIT itx and assign it. This * way, it will be linked into the ZIL's list of synchronous itxs, and * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); /* * Since we are not going to create any new dirty data, and we * can even help with clearing the existing dirty data, we * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; itx->itx_private = zcw; zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); } /* * Commit ZFS Intent Log transactions (itxs) to stable storage. * * When writing ZIL transactions to the on-disk representation of the * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple * itxs can be committed to a single lwb. Once a lwb is written and * committed to stable storage (i.e. the lwb is written, and vdevs have * been flushed), each itx that was committed to that lwb is also * considered to be committed to stable storage. * * When an itx is committed to an lwb, the log record (lr_t) contained * by the itx is copied into the lwb's zio buffer, and once this buffer * is written to disk, it becomes an on-disk ZIL block. * * As itxs are generated, they're inserted into the ZIL's queue of * uncommitted itxs. The semantics of zil_commit() are such that it will * block until all itxs that were in the queue when it was called, are * committed to stable storage. * * If "foid" is zero, this means all "synchronous" and "asynchronous" * itxs, for all objects in the dataset, will be committed to stable * storage prior to zil_commit() returning. If "foid" is non-zero, all * "synchronous" itxs for all objects, but only "asynchronous" itxs * that correspond to the foid passed in, will be committed to stable * storage prior to zil_commit() returning. * * Generally speaking, when zil_commit() is called, the consumer doesn't * actually care about _all_ of the uncommitted itxs. Instead, they're * simply trying to waiting for a specific itx to be committed to disk, * but the interface(s) for interacting with the ZIL don't allow such * fine-grained communication. A better interface would allow a consumer * to create and assign an itx, and then pass a reference to this itx to * zil_commit(); such that zil_commit() would return as soon as that * specific itx was committed to disk (instead of waiting for _all_ * itxs to be committed). * * When a thread calls zil_commit() a special "commit itx" will be * generated, along with a corresponding "waiter" for this commit itx. * zil_commit() will wait on this waiter's CV, such that when the waiter * is marked done, and signaled, zil_commit() will return. * * This commit itx is inserted into the queue of uncommitted itxs. This * provides an easy mechanism for determining which itxs were in the * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled * -- allowing the thread waiting on the waiter to return from zil_commit(). * * It's important to point out a few critical factors that allow us * to make use of the commit itxs, commit waiters, per-lwb lists of * commit waiters, and zio completion callbacks like we're doing: * * 1. The list of waiters for each lwb is traversed, and each commit * waiter is marked "done" and signaled, in the zio completion * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion * callback of the root zio for the DKIOCFLUSHWRITECACHE commands * that are sent to the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as * itxs are added to the queue, they are added to the tail of * in-memory linked lists. * * When committing the itxs to lwbs (to be written to disk), they * are committed in the same order in which the itxs were added to * the uncommitted queue's linked list(s); i.e. the linked list of * itxs to commit is traversed from head to tail, and each itx is * committed to an lwb in that order. * * * To clarify: * * - the order of "sync" itxs is preserved w.r.t. other * "sync" itxs, regardless of the corresponding objects. * - the order of "async" itxs is preserved w.r.t. other * "async" itxs corresponding to the same object. * - the order of "async" itxs is *not* preserved w.r.t. other * "async" itxs corresponding to different objects. * - the order of "sync" itxs w.r.t. "async" itxs (or vice * versa) is *not* preserved, even for itxs that correspond * to the same object. * * For more details, see: zil_itx_assign(), zil_async_to_sync(), * zil_get_commit_list(), and zil_process_commit_list(). * * 3. The lwbs represent a linked list of blocks on disk. Thus, any * lwb cannot be considered committed to stable storage, until its * "previous" lwb is also committed to stable storage. This fact, * coupled with the fact described above, means that itxs are * committed in (roughly) the order in which they were generated. * This is essential because itxs are dependent on prior itxs. * Thus, we *must not* deem an itx as being committed to stable * storage, until *all* prior itxs have also been committed to * stable storage. * * To enforce this ordering of lwb zio's, while still leveraging as * much of the underlying storage performance as possible, we rely * on two fundamental concepts: * * 1. The creation and issuance of lwb zio's is protected by * the zilog's "zl_issuer_lock", which ensures only a single * thread is creating and/or issuing lwb's at a time * 2. The "previous" lwb is a child of the "current" lwb * (leveraging the zio parent-child dependency graph) * * By relying on this parent-child zio relationship, we can have * many lwb zio's concurrently issued to the underlying storage, * but the order in which they complete will be the same order in * which they were created. */ void zil_commit(zilog_t *zilog, uint64_t foid) { /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: * * 1. A snapshot may never be modified, thus it cannot have any * in-flight itxs that would have modified the dataset. * * 2. By design, when zil_commit() is called, a commit itx will * be assigned to this zilog; as a result, the zilog will be * dirtied. We must not dirty the zilog of a snapshot; there's * checks in the code that enforce this invariant, and will * cause a panic if it's not upheld. */ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; if (!spa_writeable(zilog->zl_spa)) { /* * If the SPA is not writable, there should never be any * pending itxs waiting to be committed to disk. If that * weren't true, we'd skip writing those itxs out, and * would break the semantics of zil_commit(); thus, we're * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); return; } /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out * lwbs like would be done in zil_commit_write(). Thus, we * simply rely on txg_wait_synced() to maintain the necessary * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); } void zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) * to an lwb when zil_process_commit_list() is called. * * Since these "async" itxs must be committed prior to this * call to zil_commit returning, we must perform this operation * before we call zil_commit_itx_assign(). */ zil_async_to_sync(zilog, foid); /* * We allocate a new "waiter" structure which will initially be * linked to the commit itx using the itx's "itx_private" field. * Since the commit itx doesn't represent any on-disk state, * when it's committed to an lwb, rather than copying the its * lr_t into the lwb's buffer, the commit itx's "waiter" will be * added to the lwb's list of waiters. Then, when the lwb is * committed to stable storage, each waiter in the lwb's list of * waiters will be marked "done", and signalled. * * We must create the waiter and assign the commit itx prior to * calling zil_commit_writer(), or else our specific commit itx * is not guaranteed to be committed to an lwb prior to calling * zil_commit_waiter(). */ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to * relying on spa_sync() to write out the data this * thread is waiting on. Obviously this has performance * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } else if (wtxg != 0) { txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; zil_lwb_flush_wait_all(zilog, txg); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(list_is_empty(&zilog->zl_lwb_list)); memset(zh, 0, sizeof (zil_header_t)); memset(zilog->zl_replayed_seq, 0, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } else { /* * A destroyed ZIL chain can't contain any TX_SETSAXATTR * records. So, deactivate the feature for this dataset. * We activate it again when we start a new ZIL chain. */ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) dsl_dataset_deactivate_feature(ds, SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zil_sums_init(&zil_sums_global); zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zil_kstats_global != NULL) { zil_kstats_global->ks_data = &zil_stats; zil_kstats_global->ks_update = zil_kstats_global_update; zil_kstats_global->ks_private = NULL; kstat_install(zil_kstats_global); } } void zil_fini(void) { kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); if (zil_kstats_global != NULL) { kstat_delete(zil_kstats_global); zil_kstats_global = NULL; } zil_sums_fini(&zil_sums_global); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { int i; zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); ASSERT3P(zilog->zl_get_data, ==, NULL); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_sums = zil_sums; return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) { txg = MAX(txg, lwb->lwb_alloc_txg); txg = MAX(txg, lwb->lwb_max_txg); } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); mutex_exit(&zilog->zl_lwb_io_lock); /* * We need to use txg_wait_synced() to wait until that txg is synced. * zil_sync() will guarantee all lwbs up to that txg have been * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, (u_longlong_t)txg); if (txg < spa_freeze_txg(zilog->zl_spa)) VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; /* * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). */ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } /* * The ZIL has work to do. Ensure that the associated encryption * key will remain mapped while we are committing the log by * grabbing a reference to it. If the key isn't loaded we have no * choice but to return an error until the wrapping key is loaded. */ if (os->os_encrypted && dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); return (SET_ERROR(EACCES)); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); /* * We need to use zil_commit_impl to ensure we wait for all * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. */ zil_commit_impl(zilog, 0); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). */ txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (os->os_encrypted) dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, const lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. */ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) bp, (void) arg, (void) claim_txg; zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; return (B_TRUE); } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } int zil_reset(const char *osname, void *arg) { (void) arg; int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } EXPORT_SYMBOL(zil_alloc); EXPORT_SYMBOL(zil_free); EXPORT_SYMBOL(zil_open); EXPORT_SYMBOL(zil_close); EXPORT_SYMBOL(zil_replay); EXPORT_SYMBOL(zil_replaying); EXPORT_SYMBOL(zil_destroy); EXPORT_SYMBOL(zil_destroy_sync); EXPORT_SYMBOL(zil_itx_create); EXPORT_SYMBOL(zil_itx_destroy); EXPORT_SYMBOL(zil_itx_assign); EXPORT_SYMBOL(zil_commit); EXPORT_SYMBOL(zil_claim); EXPORT_SYMBOL(zil_check_log_chain); EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); EXPORT_SYMBOL(zil_sums_init); EXPORT_SYMBOL(zil_sums_fini); EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, "Minimum delay we care for ZIL block commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, + "Limit in bytes WR_COPIED size"); diff --git a/sys/contrib/openzfs/rpm/generic/zfs.spec.in b/sys/contrib/openzfs/rpm/generic/zfs.spec.in index 8c538a00d203..711e6c751dc0 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs.spec.in @@ -1,580 +1,580 @@ %global _sbindir /sbin %global _libdir /%{_lib} # Set the default udev directory based on distribution. %if %{undefined _udevdir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevdir %{_prefix}/lib/udev %else %global _udevdir /lib/udev %endif %endif # Set the default udevrule directory based on distribution. %if %{undefined _udevruledir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevruledir %{_prefix}/lib/udev/rules.d %else %global _udevruledir /lib/udev/rules.d %endif %endif # Set the default dracut directory based on distribution. %if %{undefined _dracutdir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _dracutdir %{_prefix}/lib/dracut %else %global _dracutdir %{_prefix}/share/dracut %endif %endif %if %{undefined _initconfdir} %global _initconfdir /etc/sysconfig %endif %if %{undefined _unitdir} %global _unitdir %{_prefix}/lib/systemd/system %endif %if %{undefined _presetdir} %global _presetdir %{_prefix}/lib/systemd/system-preset %endif %if %{undefined _modulesloaddir} %global _modulesloaddir %{_prefix}/lib/modules-load.d %endif %if %{undefined _systemdgeneratordir} %global _systemdgeneratordir %{_prefix}/lib/systemd/system-generators %endif %if %{undefined _pkgconfigdir} %global _pkgconfigdir %{_prefix}/%{_lib}/pkgconfig %endif %bcond_with debug %bcond_with debuginfo %bcond_with asan %bcond_with ubsan %bcond_with systemd %bcond_with pam %bcond_without pyzfs # Generic enable switch for systemd %if %{with systemd} %define _systemd 1 %endif # Distros below support systemd %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %define _systemd 1 %endif # When not specified default to distribution provided version. %if %{undefined __use_python} %define __python /usr/bin/python3 %define __python_pkg_version 3 %else %define __python %{__use_python} %define __python_pkg_version %{__use_python_pkg_version} %endif %define __python_sitelib %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())" 2>/dev/null || %{__python} -Esc "import sysconfig; print(sysconfig.get_path('purelib'))") Name: @PACKAGE@ Version: @VERSION@ Release: @RELEASE@%{?dist} Summary: Commands to control the kernel modules and libraries Group: System Environment/Kernel License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) Requires: libzpool5%{?_isa} = %{version}-%{release} Requires: libnvpair3%{?_isa} = %{version}-%{release} Requires: libuutil3%{?_isa} = %{version}-%{release} Requires: libzfs5%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version}-%{release} Obsoletes: spl <= %{version} # zfs-fuse provides the same commands and man pages that OpenZFS does. # Renaming those on either side would conflict with all available documentation. Conflicts: zfs-fuse %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} BuildRequires: gcc, make BuildRequires: zlib-devel BuildRequires: libuuid-devel BuildRequires: libblkid-devel BuildRequires: libudev-devel BuildRequires: libattr-devel BuildRequires: openssl-devel %if 0%{?fedora}%{?suse_version}%{?openEuler} || 0%{?rhel} >= 8 || 0%{?centos} >= 8 BuildRequires: libtirpc-devel %endif %if (0%{?fedora}%{?suse_version}%{?openEuler}) || (0%{?rhel} && 0%{?rhel} < 9) # We don't directly use it, but if this isn't installed, rpmbuild as root can # crash+corrupt rpmdb # See issue #12071 BuildRequires: ncompress %endif Requires: openssl %if 0%{?_systemd} BuildRequires: systemd %endif %endif %if 0%{?_systemd} Requires(post): systemd Requires(preun): systemd Requires(postun): systemd %endif # The zpool iostat/status -c scripts call some utilities like lsblk and iostat Requires: util-linux Requires: sysstat %description This package contains the core ZFS command line utilities. %package -n libzpool5 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel Obsoletes: libzpool2 <= %{version} Obsoletes: libzpool4 <= %{version} %description -n libzpool5 This package contains the zpool library, which provides support for managing zpools %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libzpool5 %else %post -n libzpool5 -p /sbin/ldconfig %postun -n libzpool5 -p /sbin/ldconfig %endif %package -n libnvpair3 Summary: Solaris name-value library for Linux Group: System Environment/Kernel Obsoletes: libnvpair1 <= %{version} %description -n libnvpair3 This package contains routines for packing and unpacking name-value pairs. This functionality is used to portably transport data across process boundaries, between kernel and user space, and can be used to write self describing data structures on disk. %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libnvpair3 %else %post -n libnvpair3 -p /sbin/ldconfig %postun -n libnvpair3 -p /sbin/ldconfig %endif %package -n libuutil3 Summary: Solaris userland utility library for Linux Group: System Environment/Kernel Obsoletes: libuutil1 <= %{version} %description -n libuutil3 This library provides a variety of compatibility functions for OpenZFS: * libspl: The Solaris Porting Layer userland library, which provides APIs that make it possible to run Solaris user code in a Linux environment with relatively minimal modification. * libavl: The Adelson-Velskii Landis balanced binary tree manipulation library. * libefi: The Extensible Firmware Interface library for GUID disk partitioning. * libshare: NFS, SMB, and iSCSI service integration for ZFS. %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libuutil3 %else %post -n libuutil3 -p /sbin/ldconfig %postun -n libuutil3 -p /sbin/ldconfig %endif # The library version is encoded in the package name. When updating the # version information it is important to add an obsoletes line below for # the previous version of the package. %package -n libzfs5 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel Obsoletes: libzfs2 <= %{version} Obsoletes: libzfs4 <= %{version} %description -n libzfs5 This package provides support for managing ZFS filesystems %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libzfs5 %else %post -n libzfs5 -p /sbin/ldconfig %postun -n libzfs5 -p /sbin/ldconfig %endif %package -n libzfs5-devel Summary: Development headers Group: System Environment/Kernel Requires: libzfs5%{?_isa} = %{version}-%{release} Requires: libzpool5%{?_isa} = %{version}-%{release} Requires: libnvpair3%{?_isa} = %{version}-%{release} Requires: libuutil3%{?_isa} = %{version}-%{release} Provides: libzpool5-devel = %{version}-%{release} Provides: libnvpair3-devel = %{version}-%{release} Provides: libuutil3-devel = %{version}-%{release} Obsoletes: zfs-devel <= %{version} Obsoletes: libzfs2-devel <= %{version} Obsoletes: libzfs4-devel <= %{version} %description -n libzfs5-devel This package contains the header files needed for building additional applications against the ZFS libraries. %package test Summary: Test infrastructure Group: System Environment/Kernel Requires: %{name}%{?_isa} = %{version}-%{release} Requires: parted Requires: lsscsi Requires: mdadm Requires: bc Requires: ksh Requires: fio Requires: acl Requires: sudo Requires: sysstat Requires: libaio Requires: python%{__python_pkg_version} %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} BuildRequires: libaio-devel %endif AutoReqProv: no %description test This package contains test infrastructure and support scripts for validating the file system. %package dracut Summary: Dracut module Group: System Environment/Kernel BuildArch: noarch Requires: %{name} >= %{version} Requires: dracut Requires: /usr/bin/awk Requires: grep %description dracut This package contains a dracut module used to construct an initramfs image which is ZFS aware. %if %{with pyzfs} # Enforce `python36-` package prefix for CentOS 7 # since dependencies come from EPEL and are named this way %package -n python%{__python_pkg_version}-pyzfs Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python License: Apache-2.0 BuildArch: noarch Requires: libzfs5 = %{version}-%{release} Requires: libnvpair3 = %{version}-%{release} Requires: libffi Requires: python%{__python_pkg_version} %if 0%{?centos} == 7 Requires: python36-cffi %else Requires: python%{__python_pkg_version}-cffi %endif %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} %if 0%{?centos} == 7 BuildRequires: python36-packaging BuildRequires: python36-devel BuildRequires: python36-cffi BuildRequires: python36-setuptools %else BuildRequires: python%{__python_pkg_version}-packaging BuildRequires: python%{__python_pkg_version}-devel BuildRequires: python%{__python_pkg_version}-cffi BuildRequires: python%{__python_pkg_version}-setuptools %endif BuildRequires: libffi-devel %endif %description -n python%{__python_pkg_version}-pyzfs This package provides a python wrapper for the libzfs_core C library. %endif %if 0%{?_initramfs} %package initramfs Summary: Initramfs module Group: System Environment/Kernel Requires: %{name}%{?_isa} = %{version}-%{release} Requires: initramfs-tools %description initramfs This package contains a initramfs module used to construct an initramfs image which is ZFS aware. %endif %if %{with pam} %package -n pam_zfs_key Summary: PAM module for encrypted ZFS datasets %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} BuildRequires: pam-devel %endif %description -n pam_zfs_key This package contains the pam_zfs_key PAM module, which provides support for unlocking datasets on user login. %endif %prep %if %{with debug} %define debug --enable-debug %else %define debug --disable-debug %endif %if %{with debuginfo} %define debuginfo --enable-debuginfo %else %define debuginfo --disable-debuginfo %endif %if %{with asan} %define asan --enable-asan %else %define asan --disable-asan %endif %if %{with ubsan} %define ubsan --enable-ubsan %else %define ubsan --disable-ubsan %endif %if 0%{?_systemd} %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target %else %define systemd --enable-sysvinit --disable-systemd %endif %if %{with pyzfs} %define pyzfs --enable-pyzfs %else %define pyzfs --disable-pyzfs %endif %if %{with pam} %define pam --enable-pam %else %define pam --disable-pam %endif %setup -q %build %configure \ --with-config=user \ --with-udevdir=%{_udevdir} \ --with-udevruledir=%{_udevruledir} \ --with-dracutdir=%{_dracutdir} \ --with-pamconfigsdir=%{_datadir}/pam-configs \ --with-pammoduledir=%{_libdir}/security \ --with-python=%{__python} \ --with-pkgconfigdir=%{_pkgconfigdir} \ --disable-static \ %{debug} \ %{debuginfo} \ %{asan} \ %{ubsan} \ %{systemd} \ %{pam} \ %{pyzfs} make %{?_smp_mflags} %install %{__rm} -rf $RPM_BUILD_ROOT make install DESTDIR=%{?buildroot} find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; %if 0%{!?__brp_mangle_shebangs:1} find %{?buildroot}%{_bindir} \ \( -name arc_summary -or -name arcstat -or -name dbufstat \ -or -name zilstat \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; find %{?buildroot}%{_datadir} \ \( -name test-runner.py -or -name zts-report.py \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; %endif %post %if 0%{?_systemd} %if 0%{?systemd_post:1} %systemd_post %{systemd_svcs} %else if [ "$1" = "1" -o "$1" = "install" ] ; then # Initial installation systemctl preset %{systemd_svcs} >/dev/null || true fi %endif %else if [ -x /sbin/chkconfig ]; then /sbin/chkconfig --add zfs-import /sbin/chkconfig --add zfs-load-key /sbin/chkconfig --add zfs-mount /sbin/chkconfig --add zfs-share /sbin/chkconfig --add zfs-zed fi %endif exit 0 # On RHEL/CentOS 7 the static nodes aren't refreshed by default after # installing a package. This is the default behavior for Fedora. %posttrans %if 0%{?rhel} == 7 || 0%{?centos} == 7 systemctl restart kmod-static-nodes systemctl restart systemd-tmpfiles-setup-dev udevadm trigger %endif %preun %if 0%{?_systemd} %if 0%{?systemd_preun:1} %systemd_preun %{systemd_svcs} %else if [ "$1" = "0" -o "$1" = "remove" ] ; then # Package removal, not upgrade systemctl --no-reload disable %{systemd_svcs} >/dev/null || true systemctl stop %{systemd_svcs} >/dev/null || true fi %endif %else if [ "$1" = "0" -o "$1" = "remove" ] && [ -x /sbin/chkconfig ]; then /sbin/chkconfig --del zfs-import /sbin/chkconfig --del zfs-load-key /sbin/chkconfig --del zfs-mount /sbin/chkconfig --del zfs-share /sbin/chkconfig --del zfs-zed fi %endif exit 0 %postun %if 0%{?_systemd} %if 0%{?systemd_postun:1} %systemd_postun %{systemd_svcs} %else systemctl --system daemon-reload >/dev/null || true %endif %endif %files # Core utilities %{_sbindir}/* %{_bindir}/raidz_test %{_sbindir}/zgenhostid %{_bindir}/zvol_wait # Optional Python 3 scripts %{_bindir}/arc_summary %{_bindir}/arcstat %{_bindir}/dbufstat %{_bindir}/zilstat # Man pages %{_mandir}/man1/* %{_mandir}/man4/* %{_mandir}/man5/* %{_mandir}/man7/* %{_mandir}/man8/* # Configuration files and scripts %{_libexecdir}/%{name} %{_udevdir}/vdev_id %{_udevdir}/zvol_id %{_udevdir}/rules.d/* %{_datadir}/%{name}/compatibility.d %if ! 0%{?_systemd} || 0%{?_initramfs} # Files needed for sysvinit and initramfs-tools %{_sysconfdir}/%{name}/zfs-functions %config(noreplace) %{_initconfdir}/zfs %else %exclude %{_sysconfdir}/%{name}/zfs-functions %exclude %{_initconfdir}/zfs %endif %if 0%{?_systemd} %{_unitdir}/* %{_presetdir}/* %{_modulesloaddir}/* %{_systemdgeneratordir}/* %else %config(noreplace) %{_sysconfdir}/init.d/* %endif %config(noreplace) %{_sysconfdir}/%{name}/zed.d/* %config(noreplace) %{_sysconfdir}/%{name}/zpool.d/* %config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example %attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/* -%config(noreplace) %{_sysconfdir}/bash_completion.d/zfs +%config(noreplace) %{_bashcompletiondir}/zfs %files -n libzpool5 %{_libdir}/libzpool.so.* %files -n libnvpair3 %{_libdir}/libnvpair.so.* %files -n libuutil3 %{_libdir}/libuutil.so.* %files -n libzfs5 %{_libdir}/libzfs*.so.* %files -n libzfs5-devel %{_pkgconfigdir}/libzfs.pc %{_pkgconfigdir}/libzfsbootenv.pc %{_pkgconfigdir}/libzfs_core.pc %{_libdir}/*.so %{_includedir}/* %doc AUTHORS COPYRIGHT LICENSE NOTICE README.md %files test %{_datadir}/%{name}/zfs-tests %{_datadir}/%{name}/test-runner %{_datadir}/%{name}/runfiles %{_datadir}/%{name}/*.sh %files dracut %doc contrib/dracut/README.md %{_dracutdir}/modules.d/* %if %{with pyzfs} %files -n python%{__python_pkg_version}-pyzfs %doc contrib/pyzfs/README %doc contrib/pyzfs/LICENSE %defattr(-,root,root,-) %{__python_sitelib}/libzfs_core/* %{__python_sitelib}/pyzfs* %endif %if 0%{?_initramfs} %files initramfs %doc contrib/initramfs/README.md /usr/share/initramfs-tools/* %else # Since we're not building the initramfs package, # ignore those files. %exclude /usr/share/initramfs-tools %endif %if %{with pam} %files -n pam_zfs_key %{_libdir}/security/* %{_datadir}/pam-configs/* %endif diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index ef787c65c0f9..1435c55e8fc2 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -1,983 +1,987 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # This run file contains all of the common functional tests. When # adding a new test consider also adding it to the sanity.run file # if the new test runs to completion in only a few seconds. # # Approximate run time: 4-5 hours # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/acl/off] tests = ['dosmode', 'posixmode'] tags = ['functional', 'acl'] [tests/functional/alloc_class] tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', 'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos'] tags = ['functional', 'alloc_class'] [tests/functional/append] tests = ['file_append', 'threadsappend_001_pos'] tags = ['functional', 'append'] [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos', 'arcstats_runtime_tuning'] tags = ['functional', 'arc'] [tests/functional/atime] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', 'bootfs_008_pos'] tags = ['functional', 'bootfs'] [tests/functional/btree] tests = ['btree_positive', 'btree_negative'] tags = ['functional', 'btree'] pre = post = [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', 'sensitive_none_lookup', 'sensitive_none_delete', 'sensitive_formd_lookup', 'sensitive_formd_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'insensitive_formd_lookup', 'insensitive_formd_delete', 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_rename', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy', 'tst.terminate_by_signal' ] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/checksum] tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test', 'filetest_001_pos', 'filetest_002_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', 'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested', 'zfs_clone_rm_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] tags = ['functional', 'cli_root', 'zfs_copies'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup', 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', 'zfs_diff_types', 'zfs_diff_encrypted', 'zfs_diff_mangle'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_ids_to_path] tests = ['zfs_ids_to_path_001_pos'] tags = ['functional', 'cli_root', 'zfs_ids_to_path'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_property] tests = ['zfs_written_property_001_pos'] tags = ['functional', 'cli_root', 'zfs_property'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'receive-o-x_props_override', 'receive-o-x_props_aliases', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props', 'zfs_receive_-wR-encrypted-mix', 'zfs_receive_corrective', 'zfs_receive_compressed_corrective', 'zfs_receive_large_block_corrective'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_encrypted_unloaded', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation', 'zfs_set_nomount'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', 'zfs_snapshot_009_pos'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zhack] tests = ['zhack_label_repair_001', 'zhack_label_repair_002', 'zhack_label_repair_003', 'zhack_label_repair_004'] pre = post = tags = ['functional', 'cli_root', 'zhack'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg', 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', 'zpool_clear_readonly'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'zpool_create_features_006_pos', 'zpool_create_features_007_pos', 'zpool_create_features_008_pos', 'zpool_create_features_009_pos', 'create-o_ashift', 'zpool_create_tempname', 'zpool_create_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates', 'zpool_events_clear_retained'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos', 'vdev_get_001_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', + 'zpool_import_hostid_changed', + 'zpool_import_hostid_changed_unclean_export', + 'zpool_import_hostid_changed_cachefile', + 'zpool_import_hostid_changed_cachefile_unclean_export', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', 'import_cachefile_mirror_attached', 'import_cachefile_mirror_detached', 'import_cachefile_paths_changed', 'import_cachefile_shared_device', 'import_devices_missing', 'import_log_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_fault_export_import_online', 'zpool_initialize_import_export', 'zpool_initialize_offline_export_import_online', 'zpool_initialize_online_offline', 'zpool_initialize_split', 'zpool_initialize_start_and_cancel_neg', 'zpool_initialize_start_and_cancel_pos', 'zpool_initialize_suspend_resume', 'zpool_initialize_uninit', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart', 'zpool_resilver_concurrent'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies', 'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos', 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', 'user_property_001_pos', 'user_property_002_neg'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_resilver', 'zpool_split_indirect', 'zpool_split_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_fault_export_import_online', 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', 'zpool_trim_verify_trimmed'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', 'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_discard', 'zpool_wait_freeing', 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', 'zpool_wait_initialize_flag', 'zpool_wait_multiple', 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zfs_list] tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', 'zfs_list_004_neg', 'zfs_list_005_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] user = tags = ['functional', 'cli_user', 'zfs_list'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/cli_user/zpool_status] tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_status'] [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] tags = ['functional', 'compression'] [tests/functional/cp_files] tests = ['cp_files_001_pos'] tags = ['functional', 'cp_files'] [tests/functional/crtime] tests = ['crtime_001_pos' ] tags = ['functional', 'crtime'] [tests/functional/ctime] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] [tests/functional/deadman] tests = ['deadman_ratelimit', 'deadman_sync', 'deadman_zio'] pre = post = tags = ['functional', 'deadman'] [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/fallocate] tests = ['fallocate_punch-hole'] tags = ['functional', 'fallocate'] [tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] tags = ['functional', 'features', 'async_destroy'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', 'history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos', 'history_010_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] pre = post = tests = ['hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inheritance] tests = ['inherit_001_pos'] pre = tags = ['functional', 'inheritance'] [tests/functional/io] tests = ['sync', 'psync', 'posixaio', 'mmap'] tags = ['functional', 'io'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/limits] tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', 'snapshot_limit'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_001', 'link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos', 'mmap_write_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] tests = ['umount_001', 'umountall_001'] tags = ['functional', 'mount'] [tests/functional/mv_files] tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] tags = ['functional', 'mv_files'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', 'enospc_df', 'enospc_ganging', 'enospc_rm'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', 'nopwrite_varying_compression', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/online_offline] tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] [tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', 'checkpoint_discard_busy', 'checkpoint_discard_many', 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/pool_names] tests = ['pool_names_001_pos', 'pool_names_002_neg'] pre = post = tags = ['functional', 'pool_names'] [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] [tests/functional/quota] tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', 'redacted_origin', 'redacted_panic', 'redacted_props', 'redacted_resume', 'redacted_size', 'redacted_volume'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', 'redundancy_draid_damaged1', 'redundancy_draid_damaged2', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] tags = ['functional', 'redundancy'] timeout = 1200 [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg', 'refquota_008_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', 'refreserv_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_errors', 'removal_with_export', 'removal_with_indirect', 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', 'remove_indirect', 'remove_attach_mirror', 'removal_reservation'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', 'attach_resilver', 'detach', 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'rsend_025_pos', 'rsend_026_neg', 'rsend_027_pos', 'rsend_028_neg', 'rsend_029_neg', 'rsend_030_pos', 'rsend_031_pos', 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', 'send-c_incremental', 'send-c_volume', 'send-c_zstream_recompress', 'send-c_zstreamdump', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_incremental', 'send_encrypted_freeobjects', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid', 'send_doall', 'send_raw_spill_block', 'send_raw_ashift', 'send_raw_large_blocks'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', 'slog_replay_fs_002', 'slog_replay_volume', 'slog_016_pos'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos', 'snapshot_018_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/stat] tests = ['stat_001_pos'] tags = ['functional', 'stat'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none', 'suid_write_zil_replay'] tags = ['functional', 'suid'] [tests/functional/trim] tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', 'trim_integrity', 'trim_config', 'trim_l2arc'] tags = ['functional', 'trim'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/userquota] tests = [ 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', 'userspace_001_pos', 'userspace_002_pos', 'userspace_encrypted', 'userspace_send_encrypted', 'userspace_encrypted_13709'] tags = ['functional', 'userquota'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', 'vdev_zaps_007_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_compat'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', 'zvol_misc_trim', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/zvol/zvol_stress] tests = ['zvol_stress'] tags = ['functional', 'zvol', 'zvol_stress'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/log_spacemap] tests = ['log_spacemap_import_logs'] pre = post = tags = ['functional', 'log_spacemap'] [tests/functional/l2arc] tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos', 'persist_l2arc_001_pos', 'persist_l2arc_002_pos', 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos'] tags = ['functional', 'l2arc'] [tests/functional/zpool_influxdb] tests = ['zpool_influxdb'] tags = ['functional', 'zpool_influxdb'] diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index 3272a5d5816f..158401e078aa 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -1,2069 +1,2073 @@ CLEANFILES = dist_noinst_DATA = include $(top_srcdir)/config/Substfiles.am datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests nobase_dist_datadir_zfs_tests_tests_DATA = \ perf/nfs-sample.cfg \ perf/perf.shlib \ \ perf/fio/mkfiles.fio \ perf/fio/random_reads.fio \ perf/fio/random_readwrite.fio \ perf/fio/random_readwrite_fixed.fio \ perf/fio/random_writes.fio \ perf/fio/sequential_reads.fio \ perf/fio/sequential_readwrite.fio \ perf/fio/sequential_writes.fio nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \ perf/regression/random_reads.ksh \ perf/regression/random_readwrite.ksh \ perf/regression/random_readwrite_fixed.ksh \ perf/regression/random_writes.ksh \ perf/regression/random_writes_zil.ksh \ perf/regression/sequential_reads_arc_cached_clone.ksh \ perf/regression/sequential_reads_arc_cached.ksh \ perf/regression/sequential_reads_dbuf_cached.ksh \ perf/regression/sequential_reads.ksh \ perf/regression/sequential_writes.ksh \ perf/regression/setup.ksh \ \ perf/scripts/prefetch_io.sh # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source: # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # # simd and tmpfile are Linux-only and not installed elsewhere # # C programs are specced in ../Makefile.am above as part of the main Makefile find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' regen: @$(MAKE) -C $(top_builddir) clean @$(MAKE) clean $(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am echo >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am $(find_common) ! -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am $(find_common) -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am echo >> Makefile.am echo 'if BUILD_LINUX' >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'endif' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am $(find_common) ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am # -- >8 -- nobase_nodist_datadir_zfs_tests_tests_DATA = \ functional/pam/utilities.kshlib nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \ functional/pyzfs/pyzfs_unittest.ksh SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS) if BUILD_LINUX nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/simd/simd_supported.ksh \ functional/tmpfile/cleanup.ksh \ functional/tmpfile/setup.ksh endif nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/acl/acl.cfg \ functional/acl/acl_common.kshlib \ functional/alloc_class/alloc_class.cfg \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ functional/cachefile/cachefile.cfg \ functional/cachefile/cachefile.kshlib \ functional/casenorm/casenorm.cfg \ functional/casenorm/casenorm.kshlib \ functional/channel_program/channel_common.kshlib \ functional/channel_program/lua_core/tst.args_to_lua.out \ functional/channel_program/lua_core/tst.args_to_lua.zcp \ functional/channel_program/lua_core/tst.divide_by_zero.err \ functional/channel_program/lua_core/tst.divide_by_zero.zcp \ functional/channel_program/lua_core/tst.exists.zcp \ functional/channel_program/lua_core/tst.large_prog.out \ functional/channel_program/lua_core/tst.large_prog.zcp \ functional/channel_program/lua_core/tst.lib_base.lua \ functional/channel_program/lua_core/tst.lib_coroutine.lua \ functional/channel_program/lua_core/tst.lib_strings.lua \ functional/channel_program/lua_core/tst.lib_table.lua \ functional/channel_program/lua_core/tst.nested_neg.zcp \ functional/channel_program/lua_core/tst.nested_pos.zcp \ functional/channel_program/lua_core/tst.recursive.zcp \ functional/channel_program/lua_core/tst.return_large.zcp \ functional/channel_program/lua_core/tst.return_recursive_table.zcp \ functional/channel_program/lua_core/tst.stack_gsub.err \ functional/channel_program/lua_core/tst.stack_gsub.zcp \ functional/channel_program/lua_core/tst.timeout.zcp \ functional/channel_program/synctask_core/tst.bookmark.copy.zcp \ functional/channel_program/synctask_core/tst.bookmark.create.zcp \ functional/channel_program/synctask_core/tst.get_index_props.out \ functional/channel_program/synctask_core/tst.get_index_props.zcp \ functional/channel_program/synctask_core/tst.get_number_props.out \ functional/channel_program/synctask_core/tst.get_number_props.zcp \ functional/channel_program/synctask_core/tst.get_string_props.out \ functional/channel_program/synctask_core/tst.get_string_props.zcp \ functional/channel_program/synctask_core/tst.promote_conflict.zcp \ functional/channel_program/synctask_core/tst.set_props.zcp \ functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \ functional/channel_program/synctask_core/tst.snapshot_neg.zcp \ functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \ functional/channel_program/synctask_core/tst.snapshot_rename.zcp \ functional/channel_program/synctask_core/tst.snapshot_simple.zcp \ functional/checksum/default.cfg \ functional/clean_mirror/clean_mirror_common.kshlib \ functional/clean_mirror/default.cfg \ functional/cli_root/cli_common.kshlib \ functional/cli_root/zfs_copies/zfs_copies.cfg \ functional/cli_root/zfs_copies/zfs_copies.kshlib \ functional/cli_root/zfs_create/properties.kshlib \ functional/cli_root/zfs_create/zfs_create.cfg \ functional/cli_root/zfs_create/zfs_create_common.kshlib \ functional/cli_root/zfs_destroy/zfs_destroy.cfg \ functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \ functional/cli_root/zfs_get/zfs_get_common.kshlib \ functional/cli_root/zfs_get/zfs_get_list_d.kshlib \ functional/cli_root/zfs_jail/jail.conf \ functional/cli_root/zfs_load-key/HEXKEY \ functional/cli_root/zfs_load-key/PASSPHRASE \ functional/cli_root/zfs_load-key/RAWKEY \ functional/cli_root/zfs_load-key/zfs_load-key.cfg \ functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \ functional/cli_root/zfs_mount/zfs_mount.cfg \ functional/cli_root/zfs_mount/zfs_mount.kshlib \ functional/cli_root/zfs_promote/zfs_promote.cfg \ functional/cli_root/zfs_receive/zstd_test_data.txt \ functional/cli_root/zfs_rename/zfs_rename.cfg \ functional/cli_root/zfs_rename/zfs_rename.kshlib \ functional/cli_root/zfs_rollback/zfs_rollback.cfg \ functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \ functional/cli_root/zfs_send/zfs_send.cfg \ functional/cli_root/zfs_set/zfs_set_common.kshlib \ functional/cli_root/zfs_share/zfs_share.cfg \ functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.kshlib \ functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \ functional/cli_root/zfs_wait/zfs_wait.kshlib \ functional/cli_root/zpool_add/zpool_add.cfg \ functional/cli_root/zpool_add/zpool_add.kshlib \ functional/cli_root/zpool_clear/zpool_clear.cfg \ functional/cli_root/zpool_create/draidcfg.gz \ functional/cli_root/zpool_create/zpool_create.cfg \ functional/cli_root/zpool_create/zpool_create.shlib \ functional/cli_root/zpool_destroy/zpool_destroy.cfg \ functional/cli_root/zpool_events/zpool_events.cfg \ functional/cli_root/zpool_events/zpool_events.kshlib \ functional/cli_root/zpool_expand/zpool_expand.cfg \ functional/cli_root/zpool_export/zpool_export.cfg \ functional/cli_root/zpool_export/zpool_export.kshlib \ functional/cli_root/zpool_get/vdev_get.cfg \ functional/cli_root/zpool_get/zpool_get.cfg \ functional/cli_root/zpool_get/zpool_get_parsable.cfg \ functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \ functional/cli_root/zpool_import/zpool_import.cfg \ functional/cli_root/zpool_import/zpool_import.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.shlib \ functional/cli_root/zpool_resilver/zpool_resilver.cfg \ functional/cli_root/zpool_scrub/zpool_scrub.cfg \ functional/cli_root/zpool_split/zpool_split.cfg \ functional/cli_root/zpool_trim/zpool_trim.kshlib \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ functional/compression/compress.cfg \ functional/compression/testpool_zstd.tar.gz \ functional/deadman/deadman.cfg \ functional/delegate/delegate.cfg \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ functional/grow/grow.cfg \ functional/history/history.cfg \ functional/history/history_common.kshlib \ functional/history/i386.migratedpool.DAT.Z \ functional/history/i386.orig_history.txt \ functional/history/sparc.migratedpool.DAT.Z \ functional/history/sparc.orig_history.txt \ functional/history/zfs-pool-v4.dat.Z \ functional/inheritance/config001.cfg \ functional/inheritance/config002.cfg \ functional/inheritance/config003.cfg \ functional/inheritance/config004.cfg \ functional/inheritance/config005.cfg \ functional/inheritance/config006.cfg \ functional/inheritance/config007.cfg \ functional/inheritance/config008.cfg \ functional/inheritance/config009.cfg \ functional/inheritance/config010.cfg \ functional/inheritance/config011.cfg \ functional/inheritance/config012.cfg \ functional/inheritance/config013.cfg \ functional/inheritance/config014.cfg \ functional/inheritance/config015.cfg \ functional/inheritance/config016.cfg \ functional/inheritance/config017.cfg \ functional/inheritance/config018.cfg \ functional/inheritance/config019.cfg \ functional/inheritance/config020.cfg \ functional/inheritance/config021.cfg \ functional/inheritance/config022.cfg \ functional/inheritance/config023.cfg \ functional/inheritance/config024.cfg \ functional/inheritance/inherit.kshlib \ functional/inheritance/README.config \ functional/inheritance/README.state \ functional/inheritance/state001.cfg \ functional/inheritance/state002.cfg \ functional/inheritance/state003.cfg \ functional/inheritance/state004.cfg \ functional/inheritance/state005.cfg \ functional/inheritance/state006.cfg \ functional/inheritance/state007.cfg \ functional/inheritance/state008.cfg \ functional/inheritance/state009.cfg \ functional/inheritance/state010.cfg \ functional/inheritance/state011.cfg \ functional/inheritance/state012.cfg \ functional/inheritance/state013.cfg \ functional/inheritance/state014.cfg \ functional/inheritance/state015.cfg \ functional/inheritance/state016.cfg \ functional/inheritance/state017.cfg \ functional/inheritance/state018.cfg \ functional/inheritance/state019.cfg \ functional/inheritance/state020.cfg \ functional/inheritance/state021.cfg \ functional/inheritance/state022.cfg \ functional/inheritance/state023.cfg \ functional/inheritance/state024.cfg \ functional/inuse/inuse.cfg \ functional/io/io.cfg \ functional/l2arc/l2arc.cfg \ functional/largest_pool/largest_pool.cfg \ functional/migration/migration.cfg \ functional/migration/migration.kshlib \ functional/mmap/mmap.cfg \ functional/mmp/mmp.cfg \ functional/mmp/mmp.kshlib \ functional/mv_files/mv_files.cfg \ functional/mv_files/mv_files_common.kshlib \ functional/nopwrite/nopwrite.shlib \ functional/no_space/enospc.cfg \ functional/online_offline/online_offline.cfg \ functional/pool_checkpoint/pool_checkpoint.kshlib \ functional/projectquota/projectquota.cfg \ functional/projectquota/projectquota_common.kshlib \ functional/quota/quota.cfg \ functional/quota/quota.kshlib \ functional/redacted_send/redacted.cfg \ functional/redacted_send/redacted.kshlib \ functional/redundancy/redundancy.cfg \ functional/redundancy/redundancy.kshlib \ functional/refreserv/refreserv.cfg \ functional/removal/removal.kshlib \ functional/replacement/replacement.cfg \ functional/reservation/reservation.cfg \ functional/reservation/reservation.shlib \ functional/rsend/dedup_encrypted_zvol.bz2 \ functional/rsend/dedup_encrypted_zvol.zsend.bz2 \ functional/rsend/dedup.zsend.bz2 \ functional/rsend/fs.tar.gz \ functional/rsend/rsend.cfg \ functional/rsend/rsend.kshlib \ functional/scrub_mirror/default.cfg \ functional/scrub_mirror/scrub_mirror_common.kshlib \ functional/slog/slog.cfg \ functional/slog/slog.kshlib \ functional/snapshot/snapshot.cfg \ functional/snapused/snapused.kshlib \ functional/sparse/sparse.cfg \ functional/trim/trim.cfg \ functional/trim/trim.kshlib \ functional/truncate/truncate.cfg \ functional/upgrade/upgrade_common.kshlib \ functional/user_namespace/user_namespace.cfg \ functional/user_namespace/user_namespace_common.kshlib \ functional/userquota/13709_reproducer.bz2 \ functional/userquota/userquota.cfg \ functional/userquota/userquota_common.kshlib \ functional/vdev_zaps/vdev_zaps.kshlib \ functional/xattr/xattr.cfg \ functional/xattr/xattr_common.kshlib \ functional/zvol/zvol.cfg \ functional/zvol/zvol_cli/zvol_cli.cfg \ functional/zvol/zvol_common.shlib \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \ functional/zvol/zvol_misc/zvol_misc_common.kshlib \ functional/zvol/zvol_swap/zvol_swap.cfg \ functional/idmap_mount/idmap_mount.cfg \ functional/idmap_mount/idmap_mount_common.kshlib nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/acl/off/cleanup.ksh \ functional/acl/off/dosmode.ksh \ functional/acl/off/posixmode.ksh \ functional/acl/off/setup.ksh \ functional/acl/posix/cleanup.ksh \ functional/acl/posix/posix_001_pos.ksh \ functional/acl/posix/posix_002_pos.ksh \ functional/acl/posix/posix_003_pos.ksh \ functional/acl/posix/posix_004_pos.ksh \ functional/acl/posix-sa/cleanup.ksh \ functional/acl/posix-sa/posix_001_pos.ksh \ functional/acl/posix-sa/posix_002_pos.ksh \ functional/acl/posix-sa/posix_003_pos.ksh \ functional/acl/posix-sa/posix_004_pos.ksh \ functional/acl/posix-sa/setup.ksh \ functional/acl/posix/setup.ksh \ functional/alloc_class/alloc_class_001_pos.ksh \ functional/alloc_class/alloc_class_002_neg.ksh \ functional/alloc_class/alloc_class_003_pos.ksh \ functional/alloc_class/alloc_class_004_pos.ksh \ functional/alloc_class/alloc_class_005_pos.ksh \ functional/alloc_class/alloc_class_006_pos.ksh \ functional/alloc_class/alloc_class_007_pos.ksh \ functional/alloc_class/alloc_class_008_pos.ksh \ functional/alloc_class/alloc_class_009_pos.ksh \ functional/alloc_class/alloc_class_010_pos.ksh \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ functional/alloc_class/alloc_class_014_neg.ksh \ functional/alloc_class/alloc_class_015_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ functional/append/file_append.ksh \ functional/append/threadsappend_001_pos.ksh \ functional/append/cleanup.ksh \ functional/append/setup.ksh \ functional/arc/arcstats_runtime_tuning.ksh \ functional/arc/cleanup.ksh \ functional/arc/dbufstats_001_pos.ksh \ functional/arc/dbufstats_002_pos.ksh \ functional/arc/dbufstats_003_pos.ksh \ functional/arc/setup.ksh \ functional/atime/atime_001_pos.ksh \ functional/atime/atime_002_neg.ksh \ functional/atime/atime_003_pos.ksh \ functional/atime/cleanup.ksh \ functional/atime/root_atime_off.ksh \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ functional/bootfs/bootfs_004_neg.ksh \ functional/bootfs/bootfs_005_neg.ksh \ functional/bootfs/bootfs_006_pos.ksh \ functional/bootfs/bootfs_007_pos.ksh \ functional/bootfs/bootfs_008_pos.ksh \ functional/bootfs/cleanup.ksh \ functional/bootfs/setup.ksh \ functional/btree/btree_negative.ksh \ functional/btree/btree_positive.ksh \ functional/cache/cache_001_pos.ksh \ functional/cache/cache_002_pos.ksh \ functional/cache/cache_003_pos.ksh \ functional/cache/cache_004_neg.ksh \ functional/cache/cache_005_neg.ksh \ functional/cache/cache_006_pos.ksh \ functional/cache/cache_007_neg.ksh \ functional/cache/cache_008_neg.ksh \ functional/cache/cache_009_pos.ksh \ functional/cache/cache_010_pos.ksh \ functional/cache/cache_011_pos.ksh \ functional/cache/cache_012_pos.ksh \ functional/cache/cleanup.ksh \ functional/cachefile/cachefile_001_pos.ksh \ functional/cachefile/cachefile_002_pos.ksh \ functional/cachefile/cachefile_003_pos.ksh \ functional/cachefile/cachefile_004_pos.ksh \ functional/cachefile/cleanup.ksh \ functional/cachefile/setup.ksh \ functional/cache/setup.ksh \ functional/casenorm/case_all_values.ksh \ functional/casenorm/cleanup.ksh \ functional/casenorm/insensitive_formd_delete.ksh \ functional/casenorm/insensitive_formd_lookup.ksh \ functional/casenorm/insensitive_none_delete.ksh \ functional/casenorm/insensitive_none_lookup.ksh \ functional/casenorm/mixed_create_failure.ksh \ functional/casenorm/mixed_formd_delete.ksh \ functional/casenorm/mixed_formd_lookup_ci.ksh \ functional/casenorm/mixed_formd_lookup.ksh \ functional/casenorm/mixed_none_delete.ksh \ functional/casenorm/mixed_none_lookup_ci.ksh \ functional/casenorm/mixed_none_lookup.ksh \ functional/casenorm/norm_all_values.ksh \ functional/casenorm/sensitive_formd_delete.ksh \ functional/casenorm/sensitive_formd_lookup.ksh \ functional/casenorm/sensitive_none_delete.ksh \ functional/casenorm/sensitive_none_lookup.ksh \ functional/casenorm/setup.ksh \ functional/channel_program/lua_core/cleanup.ksh \ functional/channel_program/lua_core/setup.ksh \ functional/channel_program/lua_core/tst.args_to_lua.ksh \ functional/channel_program/lua_core/tst.divide_by_zero.ksh \ functional/channel_program/lua_core/tst.exists.ksh \ functional/channel_program/lua_core/tst.integer_illegal.ksh \ functional/channel_program/lua_core/tst.integer_overflow.ksh \ functional/channel_program/lua_core/tst.language_functions_neg.ksh \ functional/channel_program/lua_core/tst.language_functions_pos.ksh \ functional/channel_program/lua_core/tst.large_prog.ksh \ functional/channel_program/lua_core/tst.libraries.ksh \ functional/channel_program/lua_core/tst.memory_limit.ksh \ functional/channel_program/lua_core/tst.nested_neg.ksh \ functional/channel_program/lua_core/tst.nested_pos.ksh \ functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \ functional/channel_program/lua_core/tst.recursive_neg.ksh \ functional/channel_program/lua_core/tst.recursive_pos.ksh \ functional/channel_program/lua_core/tst.return_large.ksh \ functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \ functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \ functional/channel_program/lua_core/tst.return_recursive_table.ksh \ functional/channel_program/lua_core/tst.stack_gsub.ksh \ functional/channel_program/lua_core/tst.timeout.ksh \ functional/channel_program/synctask_core/cleanup.ksh \ functional/channel_program/synctask_core/setup.ksh \ functional/channel_program/synctask_core/tst.bookmark.copy.ksh \ functional/channel_program/synctask_core/tst.bookmark.create.ksh \ functional/channel_program/synctask_core/tst.destroy_fs.ksh \ functional/channel_program/synctask_core/tst.destroy_snap.ksh \ functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \ functional/channel_program/synctask_core/tst.get_index_props.ksh \ functional/channel_program/synctask_core/tst.get_mountpoint.ksh \ functional/channel_program/synctask_core/tst.get_neg.ksh \ functional/channel_program/synctask_core/tst.get_number_props.ksh \ functional/channel_program/synctask_core/tst.get_string_props.ksh \ functional/channel_program/synctask_core/tst.get_type.ksh \ functional/channel_program/synctask_core/tst.get_userquota.ksh \ functional/channel_program/synctask_core/tst.get_written.ksh \ functional/channel_program/synctask_core/tst.inherit.ksh \ functional/channel_program/synctask_core/tst.list_bookmarks.ksh \ functional/channel_program/synctask_core/tst.list_children.ksh \ functional/channel_program/synctask_core/tst.list_clones.ksh \ functional/channel_program/synctask_core/tst.list_holds.ksh \ functional/channel_program/synctask_core/tst.list_snapshots.ksh \ functional/channel_program/synctask_core/tst.list_system_props.ksh \ functional/channel_program/synctask_core/tst.list_user_props.ksh \ functional/channel_program/synctask_core/tst.parse_args_neg.ksh \ functional/channel_program/synctask_core/tst.promote_conflict.ksh \ functional/channel_program/synctask_core/tst.promote_multiple.ksh \ functional/channel_program/synctask_core/tst.promote_simple.ksh \ functional/channel_program/synctask_core/tst.rollback_mult.ksh \ functional/channel_program/synctask_core/tst.rollback_one.ksh \ functional/channel_program/synctask_core/tst.set_props.ksh \ functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \ functional/channel_program/synctask_core/tst.snapshot_neg.ksh \ functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \ functional/channel_program/synctask_core/tst.snapshot_rename.ksh \ functional/channel_program/synctask_core/tst.snapshot_simple.ksh \ functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \ functional/chattr/chattr_001_pos.ksh \ functional/chattr/chattr_002_neg.ksh \ functional/chattr/cleanup.ksh \ functional/chattr/setup.ksh \ functional/checksum/cleanup.ksh \ functional/checksum/filetest_001_pos.ksh \ functional/checksum/filetest_002_pos.ksh \ functional/checksum/run_blake3_test.ksh \ functional/checksum/run_edonr_test.ksh \ functional/checksum/run_sha2_test.ksh \ functional/checksum/run_skein_test.ksh \ functional/checksum/setup.ksh \ functional/clean_mirror/clean_mirror_001_pos.ksh \ functional/clean_mirror/clean_mirror_002_pos.ksh \ functional/clean_mirror/clean_mirror_003_pos.ksh \ functional/clean_mirror/clean_mirror_004_pos.ksh \ functional/clean_mirror/cleanup.ksh \ functional/clean_mirror/setup.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ functional/cli_root/zdb/zdb_005_pos.ksh \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ functional/cli_root/zdb/zdb_decompress_zstd.ksh \ functional/cli_root/zdb/zdb_display_block.ksh \ functional/cli_root/zdb/zdb_encrypted.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ functional/cli_root/zdb/zdb_objset_id.ksh \ functional/cli_root/zdb/zdb_recover_2.ksh \ functional/cli_root/zdb/zdb_recover.ksh \ functional/cli_root/zfs_bookmark/cleanup.ksh \ functional/cli_root/zfs_bookmark/setup.ksh \ functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \ functional/cli_root/zfs_change-key/cleanup.ksh \ functional/cli_root/zfs_change-key/setup.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \ functional/cli_root/zfs_change-key/zfs_change-key.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \ functional/cli_root/zfs/cleanup.ksh \ functional/cli_root/zfs_clone/cleanup.ksh \ functional/cli_root/zfs_clone/setup.ksh \ functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \ functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \ functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \ functional/cli_root/zfs_copies/cleanup.ksh \ functional/cli_root/zfs_copies/setup.ksh \ functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \ functional/cli_root/zfs_create/cleanup.ksh \ functional/cli_root/zfs_create/setup.ksh \ functional/cli_root/zfs_create/zfs_create_001_pos.ksh \ functional/cli_root/zfs_create/zfs_create_002_pos.ksh \ functional/cli_root/zfs_create/zfs_create_003_pos.ksh \ functional/cli_root/zfs_create/zfs_create_004_pos.ksh \ functional/cli_root/zfs_create/zfs_create_005_pos.ksh \ functional/cli_root/zfs_create/zfs_create_006_pos.ksh \ functional/cli_root/zfs_create/zfs_create_007_pos.ksh \ functional/cli_root/zfs_create/zfs_create_008_neg.ksh \ functional/cli_root/zfs_create/zfs_create_009_neg.ksh \ functional/cli_root/zfs_create/zfs_create_010_neg.ksh \ functional/cli_root/zfs_create/zfs_create_011_pos.ksh \ functional/cli_root/zfs_create/zfs_create_012_pos.ksh \ functional/cli_root/zfs_create/zfs_create_013_pos.ksh \ functional/cli_root/zfs_create/zfs_create_014_pos.ksh \ functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \ functional/cli_root/zfs_create/zfs_create_dryrun.ksh \ functional/cli_root/zfs_create/zfs_create_encrypted.ksh \ functional/cli_root/zfs_create/zfs_create_nomount.ksh \ functional/cli_root/zfs_create/zfs_create_verbose.ksh \ functional/cli_root/zfs_destroy/cleanup.ksh \ functional/cli_root/zfs_destroy/setup.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \ functional/cli_root/zfs_diff/cleanup.ksh \ functional/cli_root/zfs_diff/setup.ksh \ functional/cli_root/zfs_diff/zfs_diff_changes.ksh \ functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \ functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \ functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \ functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \ functional/cli_root/zfs_diff/zfs_diff_types.ksh \ functional/cli_root/zfs_get/cleanup.ksh \ functional/cli_root/zfs_get/setup.ksh \ functional/cli_root/zfs_get/zfs_get_001_pos.ksh \ functional/cli_root/zfs_get/zfs_get_002_pos.ksh \ functional/cli_root/zfs_get/zfs_get_003_pos.ksh \ functional/cli_root/zfs_get/zfs_get_004_pos.ksh \ functional/cli_root/zfs_get/zfs_get_005_neg.ksh \ functional/cli_root/zfs_get/zfs_get_006_neg.ksh \ functional/cli_root/zfs_get/zfs_get_007_neg.ksh \ functional/cli_root/zfs_get/zfs_get_008_pos.ksh \ functional/cli_root/zfs_get/zfs_get_009_pos.ksh \ functional/cli_root/zfs_get/zfs_get_010_neg.ksh \ functional/cli_root/zfs_ids_to_path/cleanup.ksh \ functional/cli_root/zfs_ids_to_path/setup.ksh \ functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \ functional/cli_root/zfs_inherit/cleanup.ksh \ functional/cli_root/zfs_inherit/setup.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \ functional/cli_root/zfs_jail/cleanup.ksh \ functional/cli_root/zfs_jail/setup.ksh \ functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \ functional/cli_root/zfs_load-key/cleanup.ksh \ functional/cli_root/zfs_load-key/setup.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \ functional/cli_root/zfs_load-key/zfs_load-key.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \ functional/cli_root/zfs_mount/cleanup.ksh \ functional/cli_root/zfs_mount/setup.ksh \ functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ functional/cli_root/zfs_program/cleanup.ksh \ functional/cli_root/zfs_program/setup.ksh \ functional/cli_root/zfs_program/zfs_program_json.ksh \ functional/cli_root/zfs_promote/cleanup.ksh \ functional/cli_root/zfs_promote/setup.ksh \ functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \ functional/cli_root/zfs_property/cleanup.ksh \ functional/cli_root/zfs_property/setup.ksh \ functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \ functional/cli_root/zfs_receive/cleanup.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \ functional/cli_root/zfs_receive/setup.ksh \ functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_-e.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \ functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw.ksh \ functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \ functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \ functional/cli_root/zfs_rename/cleanup.ksh \ functional/cli_root/zfs_rename/setup.ksh \ functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \ functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \ functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \ functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \ functional/cli_root/zfs_reservation/cleanup.ksh \ functional/cli_root/zfs_reservation/setup.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \ functional/cli_root/zfs_send/cleanup.ksh \ functional/cli_root/zfs_send/setup.ksh \ functional/cli_root/zfs_send/zfs_send_001_pos.ksh \ functional/cli_root/zfs_send/zfs_send_002_pos.ksh \ functional/cli_root/zfs_send/zfs_send_003_pos.ksh \ functional/cli_root/zfs_send/zfs_send_004_neg.ksh \ functional/cli_root/zfs_send/zfs_send_005_pos.ksh \ functional/cli_root/zfs_send/zfs_send_006_pos.ksh \ functional/cli_root/zfs_send/zfs_send_007_pos.ksh \ functional/cli_root/zfs_send/zfs_send-b.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \ functional/cli_root/zfs_send/zfs_send_raw.ksh \ functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \ functional/cli_root/zfs_send/zfs_send_sparse.ksh \ functional/cli_root/zfs_set/cache_001_pos.ksh \ functional/cli_root/zfs_set/cache_002_neg.ksh \ functional/cli_root/zfs_set/canmount_001_pos.ksh \ functional/cli_root/zfs_set/canmount_002_pos.ksh \ functional/cli_root/zfs_set/canmount_003_pos.ksh \ functional/cli_root/zfs_set/canmount_004_pos.ksh \ functional/cli_root/zfs_set/checksum_001_pos.ksh \ functional/cli_root/zfs_set/cleanup.ksh \ functional/cli_root/zfs_set/compression_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_002_pos.ksh \ functional/cli_root/zfs_set/mountpoint_003_pos.ksh \ functional/cli_root/zfs_set/onoffs_001_pos.ksh \ functional/cli_root/zfs_set/property_alias_001_pos.ksh \ functional/cli_root/zfs_set/readonly_001_pos.ksh \ functional/cli_root/zfs_set/reservation_001_neg.ksh \ functional/cli_root/zfs_set/ro_props_001_pos.ksh \ functional/cli_root/zfs_set/setup.ksh \ functional/cli_root/zfs_set/share_mount_001_neg.ksh \ functional/cli_root/zfs_set/snapdir_001_pos.ksh \ functional/cli_root/zfs/setup.ksh \ functional/cli_root/zfs_set/user_property_001_pos.ksh \ functional/cli_root/zfs_set/user_property_002_pos.ksh \ functional/cli_root/zfs_set/user_property_003_neg.ksh \ functional/cli_root/zfs_set/user_property_004_pos.ksh \ functional/cli_root/zfs_set/version_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_002_neg.ksh \ functional/cli_root/zfs_set/zfs_set_003_neg.ksh \ functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \ functional/cli_root/zfs_set/zfs_set_keylocation.ksh \ functional/cli_root/zfs_set/zfs_set_nomount.ksh \ functional/cli_root/zfs_share/cleanup.ksh \ functional/cli_root/zfs_share/setup.ksh \ functional/cli_root/zfs_share/zfs_share_001_pos.ksh \ functional/cli_root/zfs_share/zfs_share_002_pos.ksh \ functional/cli_root/zfs_share/zfs_share_003_pos.ksh \ functional/cli_root/zfs_share/zfs_share_004_pos.ksh \ functional/cli_root/zfs_share/zfs_share_005_pos.ksh \ functional/cli_root/zfs_share/zfs_share_006_pos.ksh \ functional/cli_root/zfs_share/zfs_share_007_neg.ksh \ functional/cli_root/zfs_share/zfs_share_008_neg.ksh \ functional/cli_root/zfs_share/zfs_share_009_neg.ksh \ functional/cli_root/zfs_share/zfs_share_010_neg.ksh \ functional/cli_root/zfs_share/zfs_share_011_pos.ksh \ functional/cli_root/zfs_share/zfs_share_012_pos.ksh \ functional/cli_root/zfs_share/zfs_share_013_pos.ksh \ functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \ functional/cli_root/zfs_snapshot/cleanup.ksh \ functional/cli_root/zfs_snapshot/setup.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \ functional/cli_root/zfs_sysfs/cleanup.ksh \ functional/cli_root/zfs_sysfs/setup.ksh \ functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \ functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \ functional/cli_root/zfs_unload-key/cleanup.ksh \ functional/cli_root/zfs_unload-key/setup.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \ functional/cli_root/zfs_unmount/cleanup.ksh \ functional/cli_root/zfs_unmount/setup.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \ functional/cli_root/zfs_unshare/cleanup.ksh \ functional/cli_root/zfs_unshare/setup.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \ functional/cli_root/zfs_upgrade/cleanup.ksh \ functional/cli_root/zfs_upgrade/setup.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \ functional/cli_root/zfs_wait/cleanup.ksh \ functional/cli_root/zfs_wait/setup.ksh \ functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \ functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ functional/cli_root/zhack/zhack_label_repair_001.ksh \ functional/cli_root/zhack/zhack_label_repair_002.ksh \ functional/cli_root/zhack/zhack_label_repair_003.ksh \ functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ functional/cli_root/zpool_add/zpool_add_004_pos.ksh \ functional/cli_root/zpool_add/zpool_add_005_pos.ksh \ functional/cli_root/zpool_add/zpool_add_006_pos.ksh \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ functional/cli_root/zpool_attach/setup.ksh \ functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \ functional/cli_root/zpool/cleanup.ksh \ functional/cli_root/zpool_clear/cleanup.ksh \ functional/cli_root/zpool_clear/setup.ksh \ functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \ functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \ functional/cli_root/zpool_create/cleanup.ksh \ functional/cli_root/zpool_create/create-o_ashift.ksh \ functional/cli_root/zpool_create/setup.ksh \ functional/cli_root/zpool_create/zpool_create_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_007_neg.ksh \ functional/cli_root/zpool_create/zpool_create_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_009_neg.ksh \ functional/cli_root/zpool_create/zpool_create_010_neg.ksh \ functional/cli_root/zpool_create/zpool_create_011_neg.ksh \ functional/cli_root/zpool_create/zpool_create_012_neg.ksh \ functional/cli_root/zpool_create/zpool_create_014_neg.ksh \ functional/cli_root/zpool_create/zpool_create_015_neg.ksh \ functional/cli_root/zpool_create/zpool_create_016_pos.ksh \ functional/cli_root/zpool_create/zpool_create_017_neg.ksh \ functional/cli_root/zpool_create/zpool_create_018_pos.ksh \ functional/cli_root/zpool_create/zpool_create_019_pos.ksh \ functional/cli_root/zpool_create/zpool_create_020_pos.ksh \ functional/cli_root/zpool_create/zpool_create_021_pos.ksh \ functional/cli_root/zpool_create/zpool_create_022_pos.ksh \ functional/cli_root/zpool_create/zpool_create_023_neg.ksh \ functional/cli_root/zpool_create/zpool_create_024_pos.ksh \ functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \ functional/cli_root/zpool_create/zpool_create_encrypted.ksh \ functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \ functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \ functional/cli_root/zpool_create/zpool_create_tempname.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \ functional/cli_root/zpool_detach/cleanup.ksh \ functional/cli_root/zpool_detach/setup.ksh \ functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \ functional/cli_root/zpool_events/cleanup.ksh \ functional/cli_root/zpool_events/setup.ksh \ functional/cli_root/zpool_events/zpool_events_clear.ksh \ functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \ functional/cli_root/zpool_events/zpool_events_cliargs.ksh \ functional/cli_root/zpool_events/zpool_events_duplicates.ksh \ functional/cli_root/zpool_events/zpool_events_errors.ksh \ functional/cli_root/zpool_events/zpool_events_follow.ksh \ functional/cli_root/zpool_events/zpool_events_poolname.ksh \ functional/cli_root/zpool_expand/cleanup.ksh \ functional/cli_root/zpool_expand/setup.ksh \ functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \ functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \ functional/cli_root/zpool_export/cleanup.ksh \ functional/cli_root/zpool_export/setup.ksh \ functional/cli_root/zpool_export/zpool_export_001_pos.ksh \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_002_pos.ksh \ functional/cli_root/zpool_get/zpool_get_003_pos.ksh \ functional/cli_root/zpool_get/zpool_get_004_neg.ksh \ functional/cli_root/zpool_get/zpool_get_005_pos.ksh \ functional/cli_root/zpool_history/cleanup.ksh \ functional/cli_root/zpool_history/setup.ksh \ functional/cli_root/zpool_history/zpool_history_001_neg.ksh \ functional/cli_root/zpool_history/zpool_history_002_pos.ksh \ functional/cli_root/zpool_import/cleanup.ksh \ functional/cli_root/zpool_import/import_cachefile_device_added.ksh \ functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \ functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ functional/cli_root/zpool_import/setup.ksh \ functional/cli_root/zpool_import/zpool_import_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_004_pos.ksh \ functional/cli_root/zpool_import/zpool_import_005_pos.ksh \ functional/cli_root/zpool_import/zpool_import_006_pos.ksh \ functional/cli_root/zpool_import/zpool_import_007_pos.ksh \ functional/cli_root/zpool_import/zpool_import_008_pos.ksh \ functional/cli_root/zpool_import/zpool_import_009_neg.ksh \ functional/cli_root/zpool_import/zpool_import_010_pos.ksh \ functional/cli_root/zpool_import/zpool_import_011_neg.ksh \ functional/cli_root/zpool_import/zpool_import_012_pos.ksh \ functional/cli_root/zpool_import/zpool_import_013_neg.ksh \ functional/cli_root/zpool_import/zpool_import_014_pos.ksh \ functional/cli_root/zpool_import/zpool_import_015_pos.ksh \ functional/cli_root/zpool_import/zpool_import_016_pos.ksh \ functional/cli_root/zpool_import/zpool_import_017_pos.ksh \ functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \ functional/cli_root/zpool_import/zpool_import_errata3.ksh \ functional/cli_root/zpool_import/zpool_import_errata4.ksh \ functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \ functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \ functional/cli_root/zpool_offline/cleanup.ksh \ functional/cli_root/zpool_offline/setup.ksh \ functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \ functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \ functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \ functional/cli_root/zpool_online/cleanup.ksh \ functional/cli_root/zpool_online/setup.ksh \ functional/cli_root/zpool_online/zpool_online_001_pos.ksh \ functional/cli_root/zpool_online/zpool_online_002_neg.ksh \ functional/cli_root/zpool_remove/cleanup.ksh \ functional/cli_root/zpool_remove/setup.ksh \ functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \ functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \ functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \ functional/cli_root/zpool_reopen/cleanup.ksh \ functional/cli_root/zpool_reopen/setup.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \ functional/cli_root/zpool_replace/cleanup.ksh \ functional/cli_root/zpool_replace/replace-o_ashift.ksh \ functional/cli_root/zpool_replace/replace_prop_ashift.ksh \ functional/cli_root/zpool_replace/setup.ksh \ functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \ functional/cli_root/zpool_resilver/cleanup.ksh \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ functional/cli_root/zpool_set/zpool_set_ashift.ksh \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ functional/cli_root/zpool_split/zpool_split_cliargs.ksh \ functional/cli_root/zpool_split/zpool_split_devices.ksh \ functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \ functional/cli_root/zpool_split/zpool_split_encryption.ksh \ functional/cli_root/zpool_split/zpool_split_indirect.ksh \ functional/cli_root/zpool_split/zpool_split_props.ksh \ functional/cli_root/zpool_split/zpool_split_resilver.ksh \ functional/cli_root/zpool_split/zpool_split_vdevs.ksh \ functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \ functional/cli_root/zpool_status/cleanup.ksh \ functional/cli_root/zpool_status/setup.ksh \ functional/cli_root/zpool_status/zpool_status_001_pos.ksh \ functional/cli_root/zpool_status/zpool_status_002_pos.ksh \ functional/cli_root/zpool_status/zpool_status_003_pos.ksh \ functional/cli_root/zpool_status/zpool_status_004_pos.ksh \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \ functional/cli_root/zpool_trim/cleanup.ksh \ functional/cli_root/zpool_trim/setup.ksh \ functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \ functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \ functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \ functional/cli_root/zpool_trim/zpool_trim_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \ functional/cli_root/zpool_trim/zpool_trim_partial.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_secure.ksh \ functional/cli_root/zpool_trim/zpool_trim_split.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \ functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \ functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \ functional/cli_root/zpool_upgrade/cleanup.ksh \ functional/cli_root/zpool_upgrade/setup.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \ functional/cli_root/zpool_wait/cleanup.ksh \ functional/cli_root/zpool_wait/scan/cleanup.ksh \ functional/cli_root/zpool_wait/scan/setup.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \ functional/cli_root/zpool_wait/setup.ksh \ functional/cli_root/zpool_wait/zpool_wait_discard.ksh \ functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \ functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_usage.ksh \ functional/cli_root/zpool/zpool_001_neg.ksh \ functional/cli_root/zpool/zpool_002_pos.ksh \ functional/cli_root/zpool/zpool_003_pos.ksh \ functional/cli_root/zpool/zpool_colors.ksh \ functional/cli_user/misc/arcstat_001_pos.ksh \ functional/cli_user/misc/arc_summary_001_pos.ksh \ functional/cli_user/misc/arc_summary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ functional/cli_user/misc/zdb_001_neg.ksh \ functional/cli_user/misc/zfs_001_neg.ksh \ functional/cli_user/misc/zfs_allow_001_neg.ksh \ functional/cli_user/misc/zfs_clone_001_neg.ksh \ functional/cli_user/misc/zfs_create_001_neg.ksh \ functional/cli_user/misc/zfs_destroy_001_neg.ksh \ functional/cli_user/misc/zfs_get_001_neg.ksh \ functional/cli_user/misc/zfs_inherit_001_neg.ksh \ functional/cli_user/misc/zfs_mount_001_neg.ksh \ functional/cli_user/misc/zfs_promote_001_neg.ksh \ functional/cli_user/misc/zfs_receive_001_neg.ksh \ functional/cli_user/misc/zfs_rename_001_neg.ksh \ functional/cli_user/misc/zfs_rollback_001_neg.ksh \ functional/cli_user/misc/zfs_send_001_neg.ksh \ functional/cli_user/misc/zfs_set_001_neg.ksh \ functional/cli_user/misc/zfs_share_001_neg.ksh \ functional/cli_user/misc/zfs_snapshot_001_neg.ksh \ functional/cli_user/misc/zfs_unallow_001_neg.ksh \ functional/cli_user/misc/zfs_unmount_001_neg.ksh \ functional/cli_user/misc/zfs_unshare_001_neg.ksh \ functional/cli_user/misc/zfs_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_001_neg.ksh \ functional/cli_user/misc/zpool_add_001_neg.ksh \ functional/cli_user/misc/zpool_attach_001_neg.ksh \ functional/cli_user/misc/zpool_clear_001_neg.ksh \ functional/cli_user/misc/zpool_create_001_neg.ksh \ functional/cli_user/misc/zpool_destroy_001_neg.ksh \ functional/cli_user/misc/zpool_detach_001_neg.ksh \ functional/cli_user/misc/zpool_export_001_neg.ksh \ functional/cli_user/misc/zpool_get_001_neg.ksh \ functional/cli_user/misc/zpool_history_001_neg.ksh \ functional/cli_user/misc/zpool_import_001_neg.ksh \ functional/cli_user/misc/zpool_import_002_neg.ksh \ functional/cli_user/misc/zpool_offline_001_neg.ksh \ functional/cli_user/misc/zpool_online_001_neg.ksh \ functional/cli_user/misc/zpool_remove_001_neg.ksh \ functional/cli_user/misc/zpool_replace_001_neg.ksh \ functional/cli_user/misc/zpool_scrub_001_neg.ksh \ functional/cli_user/misc/zpool_set_001_neg.ksh \ functional/cli_user/misc/zpool_status_001_neg.ksh \ functional/cli_user/misc/zpool_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_wait_privilege.ksh \ functional/cli_user/zfs_list/cleanup.ksh \ functional/cli_user/zfs_list/setup.ksh \ functional/cli_user/zfs_list/zfs_list_001_pos.ksh \ functional/cli_user/zfs_list/zfs_list_002_pos.ksh \ functional/cli_user/zfs_list/zfs_list_003_pos.ksh \ functional/cli_user/zfs_list/zfs_list_004_neg.ksh \ functional/cli_user/zfs_list/zfs_list_005_neg.ksh \ functional/cli_user/zfs_list/zfs_list_007_pos.ksh \ functional/cli_user/zfs_list/zfs_list_008_neg.ksh \ functional/cli_user/zpool_iostat/cleanup.ksh \ functional/cli_user/zpool_iostat/setup.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \ functional/cli_user/zpool_list/cleanup.ksh \ functional/cli_user/zpool_list/setup.ksh \ functional/cli_user/zpool_list/zpool_list_001_pos.ksh \ functional/cli_user/zpool_list/zpool_list_002_neg.ksh \ functional/cli_user/zpool_status/cleanup.ksh \ functional/cli_user/zpool_status/setup.ksh \ functional/cli_user/zpool_status/zpool_status_003_pos.ksh \ functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \ functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \ functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \ functional/compression/cleanup.ksh \ functional/compression/compress_001_pos.ksh \ functional/compression/compress_002_pos.ksh \ functional/compression/compress_003_pos.ksh \ functional/compression/compress_004_pos.ksh \ functional/compression/compress_zstd_bswap.ksh \ functional/compression/l2arc_compressed_arc_disabled.ksh \ functional/compression/l2arc_compressed_arc.ksh \ functional/compression/l2arc_encrypted.ksh \ functional/compression/l2arc_encrypted_no_compressed_arc.ksh \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ functional/crtime/crtime_001_pos.ksh \ functional/crtime/setup.ksh \ functional/ctime/cleanup.ksh \ functional/ctime/ctime_001_pos.ksh \ functional/ctime/setup.ksh \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ functional/delegate/zfs_allow_001_pos.ksh \ functional/delegate/zfs_allow_002_pos.ksh \ functional/delegate/zfs_allow_003_pos.ksh \ functional/delegate/zfs_allow_004_pos.ksh \ functional/delegate/zfs_allow_005_pos.ksh \ functional/delegate/zfs_allow_006_pos.ksh \ functional/delegate/zfs_allow_007_pos.ksh \ functional/delegate/zfs_allow_008_pos.ksh \ functional/delegate/zfs_allow_009_neg.ksh \ functional/delegate/zfs_allow_010_pos.ksh \ functional/delegate/zfs_allow_011_neg.ksh \ functional/delegate/zfs_allow_012_neg.ksh \ functional/delegate/zfs_unallow_001_pos.ksh \ functional/delegate/zfs_unallow_002_pos.ksh \ functional/delegate/zfs_unallow_003_pos.ksh \ functional/delegate/zfs_unallow_004_pos.ksh \ functional/delegate/zfs_unallow_005_pos.ksh \ functional/delegate/zfs_unallow_006_pos.ksh \ functional/delegate/zfs_unallow_007_neg.ksh \ functional/delegate/zfs_unallow_008_neg.ksh \ functional/devices/cleanup.ksh \ functional/devices/devices_001_pos.ksh \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ functional/dos_attributes/write_dos_attrs_001.ksh \ functional/events/cleanup.ksh \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ functional/exec/setup.ksh \ functional/fadvise/cleanup.ksh \ functional/fadvise/fadvise_sequential.ksh \ functional/fadvise/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ functional/fallocate/fallocate_punch-hole.ksh \ functional/fallocate/fallocate_zero-range.ksh \ functional/fallocate/setup.ksh \ functional/fault/auto_offline_001_pos.ksh \ functional/fault/auto_online_001_pos.ksh \ functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ functional/fault/auto_spare_multiple.ksh \ functional/fault/auto_spare_shared.ksh \ functional/fault/cleanup.ksh \ functional/fault/decompress_fault.ksh \ functional/fault/decrypt_fault.ksh \ functional/fault/scrub_after_resilver.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ functional/features/async_destroy/cleanup.ksh \ functional/features/async_destroy/setup.ksh \ functional/features/large_dnode/cleanup.ksh \ functional/features/large_dnode/large_dnode_001_pos.ksh \ functional/features/large_dnode/large_dnode_002_pos.ksh \ functional/features/large_dnode/large_dnode_003_pos.ksh \ functional/features/large_dnode/large_dnode_004_neg.ksh \ functional/features/large_dnode/large_dnode_005_pos.ksh \ functional/features/large_dnode/large_dnode_006_pos.ksh \ functional/features/large_dnode/large_dnode_007_neg.ksh \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ functional/history/history_001_pos.ksh \ functional/history/history_002_pos.ksh \ functional/history/history_003_pos.ksh \ functional/history/history_004_pos.ksh \ functional/history/history_005_neg.ksh \ functional/history/history_006_neg.ksh \ functional/history/history_007_pos.ksh \ functional/history/history_008_pos.ksh \ functional/history/history_009_pos.ksh \ functional/history/history_010_pos.ksh \ functional/history/setup.ksh \ functional/inheritance/cleanup.ksh \ functional/inheritance/inherit_001_pos.ksh \ functional/inuse/inuse_001_pos.ksh \ functional/inuse/inuse_003_pos.ksh \ functional/inuse/inuse_004_pos.ksh \ functional/inuse/inuse_005_pos.ksh \ functional/inuse/inuse_006_pos.ksh \ functional/inuse/inuse_007_pos.ksh \ functional/inuse/inuse_008_pos.ksh \ functional/inuse/inuse_009_pos.ksh \ functional/inuse/setup.ksh \ functional/io/cleanup.ksh \ functional/io/io_uring.ksh \ functional/io/libaio.ksh \ functional/io/mmap.ksh \ functional/io/posixaio.ksh \ functional/io/psync.ksh \ functional/io/setup.ksh \ functional/io/sync.ksh \ functional/l2arc/cleanup.ksh \ functional/l2arc/l2arc_arcstats_pos.ksh \ functional/l2arc/l2arc_l2miss_pos.ksh \ functional/l2arc/l2arc_mfuonly_pos.ksh \ functional/l2arc/persist_l2arc_001_pos.ksh \ functional/l2arc/persist_l2arc_002_pos.ksh \ functional/l2arc/persist_l2arc_003_neg.ksh \ functional/l2arc/persist_l2arc_004_pos.ksh \ functional/l2arc/persist_l2arc_005_pos.ksh \ functional/l2arc/setup.ksh \ functional/large_files/cleanup.ksh \ functional/large_files/large_files_001_pos.ksh \ functional/large_files/large_files_002_pos.ksh \ functional/large_files/setup.ksh \ functional/largest_pool/largest_pool_001_pos.ksh \ functional/libzfs/cleanup.ksh \ functional/libzfs/libzfs_input.ksh \ functional/libzfs/setup.ksh \ functional/limits/cleanup.ksh \ functional/limits/filesystem_count.ksh \ functional/limits/filesystem_limit.ksh \ functional/limits/setup.ksh \ functional/limits/snapshot_count.ksh \ functional/limits/snapshot_limit.ksh \ functional/link_count/cleanup.ksh \ functional/link_count/link_count_001.ksh \ functional/link_count/link_count_root_inode.ksh \ functional/link_count/setup.ksh \ functional/log_spacemap/log_spacemap_import_logs.ksh \ functional/migration/cleanup.ksh \ functional/migration/migration_001_pos.ksh \ functional/migration/migration_002_pos.ksh \ functional/migration/migration_003_pos.ksh \ functional/migration/migration_004_pos.ksh \ functional/migration/migration_005_pos.ksh \ functional/migration/migration_006_pos.ksh \ functional/migration/migration_007_pos.ksh \ functional/migration/migration_008_pos.ksh \ functional/migration/migration_009_pos.ksh \ functional/migration/migration_010_pos.ksh \ functional/migration/migration_011_pos.ksh \ functional/migration/migration_012_pos.ksh \ functional/migration/setup.ksh \ functional/mmap/cleanup.ksh \ functional/mmap/mmap_libaio_001_pos.ksh \ functional/mmap/mmap_mixed.ksh \ functional/mmap/mmap_read_001_pos.ksh \ functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ functional/mmap/mmap_write_001_pos.ksh \ functional/mmap/setup.ksh \ functional/mmp/cleanup.ksh \ functional/mmp/mmp_active_import.ksh \ functional/mmp/mmp_exported_import.ksh \ functional/mmp/mmp_hostid.ksh \ functional/mmp/mmp_inactive_import.ksh \ functional/mmp/mmp_interval.ksh \ functional/mmp/mmp_on_off.ksh \ functional/mmp/mmp_on_thread.ksh \ functional/mmp/mmp_on_uberblocks.ksh \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ functional/mount/cleanup.ksh \ functional/mount/setup.ksh \ functional/mount/umount_001.ksh \ functional/mount/umountall_001.ksh \ functional/mount/umount_unlinked_drain.ksh \ functional/mv_files/cleanup.ksh \ functional/mv_files/mv_files_001_pos.ksh \ functional/mv_files/mv_files_002_pos.ksh \ functional/mv_files/random_creation.ksh \ functional/mv_files/setup.ksh \ functional/nestedfs/cleanup.ksh \ functional/nestedfs/nestedfs_001_pos.ksh \ functional/nestedfs/setup.ksh \ functional/nopwrite/cleanup.ksh \ functional/nopwrite/nopwrite_copies.ksh \ functional/nopwrite/nopwrite_mtime.ksh \ functional/nopwrite/nopwrite_negative.ksh \ functional/nopwrite/nopwrite_promoted_clone.ksh \ functional/nopwrite/nopwrite_recsize.ksh \ functional/nopwrite/nopwrite_sync.ksh \ functional/nopwrite/nopwrite_varying_compression.ksh \ functional/nopwrite/nopwrite_volume.ksh \ functional/nopwrite/setup.ksh \ functional/no_space/cleanup.ksh \ functional/no_space/enospc_001_pos.ksh \ functional/no_space/enospc_002_pos.ksh \ functional/no_space/enospc_003_pos.ksh \ functional/no_space/enospc_df.ksh \ functional/no_space/enospc_ganging.ksh \ functional/no_space/enospc_rm.ksh \ functional/no_space/setup.ksh \ functional/online_offline/cleanup.ksh \ functional/online_offline/online_offline_001_pos.ksh \ functional/online_offline/online_offline_002_neg.ksh \ functional/online_offline/online_offline_003_neg.ksh \ functional/online_offline/setup.ksh \ functional/pam/cleanup.ksh \ functional/pam/pam_basic.ksh \ functional/pam/pam_change_unmounted.ksh \ functional/pam/pam_nounmount.ksh \ functional/pam/pam_recursive.ksh \ functional/pam/pam_short_password.ksh \ functional/pam/setup.ksh \ functional/pool_checkpoint/checkpoint_after_rewind.ksh \ functional/pool_checkpoint/checkpoint_big_rewind.ksh \ functional/pool_checkpoint/checkpoint_capacity.ksh \ functional/pool_checkpoint/checkpoint_conf_change.ksh \ functional/pool_checkpoint/checkpoint_discard_busy.ksh \ functional/pool_checkpoint/checkpoint_discard.ksh \ functional/pool_checkpoint/checkpoint_discard_many.ksh \ functional/pool_checkpoint/checkpoint_indirect.ksh \ functional/pool_checkpoint/checkpoint_invalid.ksh \ functional/pool_checkpoint/checkpoint_lun_expsz.ksh \ functional/pool_checkpoint/checkpoint_open.ksh \ functional/pool_checkpoint/checkpoint_removal.ksh \ functional/pool_checkpoint/checkpoint_rewind.ksh \ functional/pool_checkpoint/checkpoint_ro_rewind.ksh \ functional/pool_checkpoint/checkpoint_sm_scale.ksh \ functional/pool_checkpoint/checkpoint_twice.ksh \ functional/pool_checkpoint/checkpoint_vdev_add.ksh \ functional/pool_checkpoint/checkpoint_zdb.ksh \ functional/pool_checkpoint/checkpoint_zhack_feat.ksh \ functional/pool_checkpoint/cleanup.ksh \ functional/pool_checkpoint/setup.ksh \ functional/pool_names/pool_names_001_pos.ksh \ functional/pool_names/pool_names_002_neg.ksh \ functional/poolversion/cleanup.ksh \ functional/poolversion/poolversion_001_pos.ksh \ functional/poolversion/poolversion_002_pos.ksh \ functional/poolversion/setup.ksh \ functional/privilege/cleanup.ksh \ functional/privilege/privilege_001_pos.ksh \ functional/privilege/privilege_002_pos.ksh \ functional/privilege/setup.ksh \ functional/procfs/cleanup.ksh \ functional/procfs/pool_state.ksh \ functional/procfs/procfs_list_basic.ksh \ functional/procfs/procfs_list_concurrent_readers.ksh \ functional/procfs/procfs_list_stale_read.ksh \ functional/procfs/setup.ksh \ functional/projectquota/cleanup.ksh \ functional/projectquota/projectid_001_pos.ksh \ functional/projectquota/projectid_002_pos.ksh \ functional/projectquota/projectid_003_pos.ksh \ functional/projectquota/projectquota_001_pos.ksh \ functional/projectquota/projectquota_002_pos.ksh \ functional/projectquota/projectquota_003_pos.ksh \ functional/projectquota/projectquota_004_neg.ksh \ functional/projectquota/projectquota_005_pos.ksh \ functional/projectquota/projectquota_006_pos.ksh \ functional/projectquota/projectquota_007_pos.ksh \ functional/projectquota/projectquota_008_pos.ksh \ functional/projectquota/projectquota_009_pos.ksh \ functional/projectquota/projectspace_001_pos.ksh \ functional/projectquota/projectspace_002_pos.ksh \ functional/projectquota/projectspace_003_pos.ksh \ functional/projectquota/projectspace_004_pos.ksh \ functional/projectquota/projecttree_001_pos.ksh \ functional/projectquota/projecttree_002_pos.ksh \ functional/projectquota/projecttree_003_neg.ksh \ functional/projectquota/setup.ksh \ functional/quota/cleanup.ksh \ functional/quota/quota_001_pos.ksh \ functional/quota/quota_002_pos.ksh \ functional/quota/quota_003_pos.ksh \ functional/quota/quota_004_pos.ksh \ functional/quota/quota_005_pos.ksh \ functional/quota/quota_006_neg.ksh \ functional/quota/setup.ksh \ functional/raidz/cleanup.ksh \ functional/raidz/raidz_001_neg.ksh \ functional/raidz/raidz_002_pos.ksh \ functional/raidz/raidz_003_pos.ksh \ functional/raidz/raidz_004_pos.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ functional/redacted_send/redacted_contents.ksh \ functional/redacted_send/redacted_deleted.ksh \ functional/redacted_send/redacted_disabled_feature.ksh \ functional/redacted_send/redacted_embedded.ksh \ functional/redacted_send/redacted_holes.ksh \ functional/redacted_send/redacted_incrementals.ksh \ functional/redacted_send/redacted_largeblocks.ksh \ functional/redacted_send/redacted_many_clones.ksh \ functional/redacted_send/redacted_mixed_recsize.ksh \ functional/redacted_send/redacted_mounts.ksh \ functional/redacted_send/redacted_negative.ksh \ functional/redacted_send/redacted_origin.ksh \ functional/redacted_send/redacted_panic.ksh \ functional/redacted_send/redacted_props.ksh \ functional/redacted_send/redacted_resume.ksh \ functional/redacted_send/redacted_size.ksh \ functional/redacted_send/redacted_volume.ksh \ functional/redacted_send/setup.ksh \ functional/redundancy/cleanup.ksh \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ functional/redundancy/redundancy_draid_spare3.ksh \ functional/redundancy/redundancy_mirror.ksh \ functional/redundancy/redundancy_raidz1.ksh \ functional/redundancy/redundancy_raidz2.ksh \ functional/redundancy/redundancy_raidz3.ksh \ functional/redundancy/redundancy_raidz.ksh \ functional/redundancy/redundancy_stripe.ksh \ functional/redundancy/setup.ksh \ functional/refquota/cleanup.ksh \ functional/refquota/refquota_001_pos.ksh \ functional/refquota/refquota_002_pos.ksh \ functional/refquota/refquota_003_pos.ksh \ functional/refquota/refquota_004_pos.ksh \ functional/refquota/refquota_005_pos.ksh \ functional/refquota/refquota_006_neg.ksh \ functional/refquota/refquota_007_neg.ksh \ functional/refquota/refquota_008_neg.ksh \ functional/refquota/setup.ksh \ functional/refreserv/cleanup.ksh \ functional/refreserv/refreserv_001_pos.ksh \ functional/refreserv/refreserv_002_pos.ksh \ functional/refreserv/refreserv_003_pos.ksh \ functional/refreserv/refreserv_004_pos.ksh \ functional/refreserv/refreserv_005_pos.ksh \ functional/refreserv/refreserv_multi_raidz.ksh \ functional/refreserv/refreserv_raidz.ksh \ functional/refreserv/setup.ksh \ functional/removal/cleanup.ksh \ functional/removal/removal_all_vdev.ksh \ functional/removal/removal_cancel.ksh \ functional/removal/removal_check_space.ksh \ functional/removal/removal_condense_export.ksh \ functional/removal/removal_multiple_indirection.ksh \ functional/removal/removal_nopwrite.ksh \ functional/removal/removal_remap_deadlists.ksh \ functional/removal/removal_reservation.ksh \ functional/removal/removal_resume_export.ksh \ functional/removal/removal_sanity.ksh \ functional/removal/removal_with_add.ksh \ functional/removal/removal_with_create_fs.ksh \ functional/removal/removal_with_dedup.ksh \ functional/removal/removal_with_errors.ksh \ functional/removal/removal_with_export.ksh \ functional/removal/removal_with_faulted.ksh \ functional/removal/removal_with_ganging.ksh \ functional/removal/removal_with_indirect.ksh \ functional/removal/removal_with_remove.ksh \ functional/removal/removal_with_scrub.ksh \ functional/removal/removal_with_send.ksh \ functional/removal/removal_with_send_recv.ksh \ functional/removal/removal_with_snapshot.ksh \ functional/removal/removal_with_write.ksh \ functional/removal/removal_with_zdb.ksh \ functional/removal/remove_attach_mirror.ksh \ functional/removal/remove_expanded.ksh \ functional/removal/remove_indirect.ksh \ functional/removal/remove_mirror.ksh \ functional/removal/remove_mirror_sanity.ksh \ functional/removal/remove_raidz.ksh \ functional/rename_dirs/cleanup.ksh \ functional/rename_dirs/rename_dirs_001_pos.ksh \ functional/rename_dirs/setup.ksh \ functional/renameat2/cleanup.ksh \ functional/renameat2/setup.ksh \ functional/renameat2/renameat2_exchange.ksh \ functional/renameat2/renameat2_noreplace.ksh \ functional/renameat2/renameat2_whiteout.ksh \ functional/replacement/attach_import.ksh \ functional/replacement/attach_multiple.ksh \ functional/replacement/attach_rebuild.ksh \ functional/replacement/attach_resilver.ksh \ functional/replacement/cleanup.ksh \ functional/replacement/detach.ksh \ functional/replacement/rebuild_disabled_feature.ksh \ functional/replacement/rebuild_multiple.ksh \ functional/replacement/rebuild_raidz.ksh \ functional/replacement/replace_import.ksh \ functional/replacement/replace_rebuild.ksh \ functional/replacement/replace_resilver.ksh \ functional/replacement/resilver_restart_001.ksh \ functional/replacement/resilver_restart_002.ksh \ functional/replacement/scrub_cancel.ksh \ functional/replacement/setup.ksh \ functional/reservation/cleanup.ksh \ functional/reservation/reservation_001_pos.ksh \ functional/reservation/reservation_002_pos.ksh \ functional/reservation/reservation_003_pos.ksh \ functional/reservation/reservation_004_pos.ksh \ functional/reservation/reservation_005_pos.ksh \ functional/reservation/reservation_006_pos.ksh \ functional/reservation/reservation_007_pos.ksh \ functional/reservation/reservation_008_pos.ksh \ functional/reservation/reservation_009_pos.ksh \ functional/reservation/reservation_010_pos.ksh \ functional/reservation/reservation_011_pos.ksh \ functional/reservation/reservation_012_pos.ksh \ functional/reservation/reservation_013_pos.ksh \ functional/reservation/reservation_014_pos.ksh \ functional/reservation/reservation_015_pos.ksh \ functional/reservation/reservation_016_pos.ksh \ functional/reservation/reservation_017_pos.ksh \ functional/reservation/reservation_018_pos.ksh \ functional/reservation/reservation_019_pos.ksh \ functional/reservation/reservation_020_pos.ksh \ functional/reservation/reservation_021_neg.ksh \ functional/reservation/reservation_022_pos.ksh \ functional/reservation/setup.ksh \ functional/rootpool/cleanup.ksh \ functional/rootpool/rootpool_002_neg.ksh \ functional/rootpool/rootpool_003_neg.ksh \ functional/rootpool/rootpool_007_pos.ksh \ functional/rootpool/setup.ksh \ functional/rsend/cleanup.ksh \ functional/rsend/recv_dedup_encrypted_zvol.ksh \ functional/rsend/recv_dedup.ksh \ functional/rsend/rsend_001_pos.ksh \ functional/rsend/rsend_002_pos.ksh \ functional/rsend/rsend_003_pos.ksh \ functional/rsend/rsend_004_pos.ksh \ functional/rsend/rsend_005_pos.ksh \ functional/rsend/rsend_006_pos.ksh \ functional/rsend/rsend_007_pos.ksh \ functional/rsend/rsend_008_pos.ksh \ functional/rsend/rsend_009_pos.ksh \ functional/rsend/rsend_010_pos.ksh \ functional/rsend/rsend_011_pos.ksh \ functional/rsend/rsend_012_pos.ksh \ functional/rsend/rsend_013_pos.ksh \ functional/rsend/rsend_014_pos.ksh \ functional/rsend/rsend_016_neg.ksh \ functional/rsend/rsend_019_pos.ksh \ functional/rsend/rsend_020_pos.ksh \ functional/rsend/rsend_021_pos.ksh \ functional/rsend/rsend_022_pos.ksh \ functional/rsend/rsend_024_pos.ksh \ functional/rsend/rsend_025_pos.ksh \ functional/rsend/rsend_026_neg.ksh \ functional/rsend/rsend_027_pos.ksh \ functional/rsend/rsend_028_neg.ksh \ functional/rsend/rsend_029_neg.ksh \ functional/rsend/rsend_030_pos.ksh \ functional/rsend/rsend_031_pos.ksh \ functional/rsend/send-c_embedded_blocks.ksh \ functional/rsend/send-c_incremental.ksh \ functional/rsend/send-c_lz4_disabled.ksh \ functional/rsend/send-c_mixed_compression.ksh \ functional/rsend/send-c_props.ksh \ functional/rsend/send-c_recv_dedup.ksh \ functional/rsend/send-c_recv_lz4_disabled.ksh \ functional/rsend/send-c_resume.ksh \ functional/rsend/send-c_stream_size_estimate.ksh \ functional/rsend/send-c_verify_contents.ksh \ functional/rsend/send-c_verify_ratio.ksh \ functional/rsend/send-c_volume.ksh \ functional/rsend/send-c_zstream_recompress.ksh \ functional/rsend/send-c_zstreamdump.ksh \ functional/rsend/send-cpL_varied_recsize.ksh \ functional/rsend/send_doall.ksh \ functional/rsend/send_encrypted_incremental.ksh \ functional/rsend/send_encrypted_files.ksh \ functional/rsend/send_encrypted_freeobjects.ksh \ functional/rsend/send_encrypted_hierarchy.ksh \ functional/rsend/send_encrypted_props.ksh \ functional/rsend/send_encrypted_truncated_files.ksh \ functional/rsend/send_freeobjects.ksh \ functional/rsend/send_holds.ksh \ functional/rsend/send_hole_birth.ksh \ functional/rsend/send_invalid.ksh \ functional/rsend/send-L_toggle.ksh \ functional/rsend/send_mixed_raw.ksh \ functional/rsend/send_partial_dataset.ksh \ functional/rsend/send_raw_ashift.ksh \ functional/rsend/send_raw_spill_block.ksh \ functional/rsend/send_raw_large_blocks.ksh \ functional/rsend/send_realloc_dnode_size.ksh \ functional/rsend/send_realloc_encrypted_files.ksh \ functional/rsend/send_realloc_files.ksh \ functional/rsend/send_spill_block.ksh \ functional/rsend/send-wR_encrypted_zvol.ksh \ functional/rsend/setup.ksh \ functional/scrub_mirror/cleanup.ksh \ functional/scrub_mirror/scrub_mirror_001_pos.ksh \ functional/scrub_mirror/scrub_mirror_002_pos.ksh \ functional/scrub_mirror/scrub_mirror_003_pos.ksh \ functional/scrub_mirror/scrub_mirror_004_pos.ksh \ functional/scrub_mirror/setup.ksh \ functional/slog/cleanup.ksh \ functional/slog/setup.ksh \ functional/slog/slog_001_pos.ksh \ functional/slog/slog_002_pos.ksh \ functional/slog/slog_003_pos.ksh \ functional/slog/slog_004_pos.ksh \ functional/slog/slog_005_pos.ksh \ functional/slog/slog_006_pos.ksh \ functional/slog/slog_007_pos.ksh \ functional/slog/slog_008_neg.ksh \ functional/slog/slog_009_neg.ksh \ functional/slog/slog_010_neg.ksh \ functional/slog/slog_011_neg.ksh \ functional/slog/slog_012_neg.ksh \ functional/slog/slog_013_pos.ksh \ functional/slog/slog_014_pos.ksh \ functional/slog/slog_015_neg.ksh \ functional/slog/slog_016_pos.ksh \ functional/slog/slog_replay_fs_001.ksh \ functional/slog/slog_replay_fs_002.ksh \ functional/slog/slog_replay_volume.ksh \ functional/snapshot/cleanup.ksh \ functional/snapshot/clone_001_pos.ksh \ functional/snapshot/rollback_001_pos.ksh \ functional/snapshot/rollback_002_pos.ksh \ functional/snapshot/rollback_003_pos.ksh \ functional/snapshot/setup.ksh \ functional/snapshot/snapshot_001_pos.ksh \ functional/snapshot/snapshot_002_pos.ksh \ functional/snapshot/snapshot_003_pos.ksh \ functional/snapshot/snapshot_004_pos.ksh \ functional/snapshot/snapshot_005_pos.ksh \ functional/snapshot/snapshot_006_pos.ksh \ functional/snapshot/snapshot_007_pos.ksh \ functional/snapshot/snapshot_008_pos.ksh \ functional/snapshot/snapshot_009_pos.ksh \ functional/snapshot/snapshot_010_pos.ksh \ functional/snapshot/snapshot_011_pos.ksh \ functional/snapshot/snapshot_012_pos.ksh \ functional/snapshot/snapshot_013_pos.ksh \ functional/snapshot/snapshot_014_pos.ksh \ functional/snapshot/snapshot_015_pos.ksh \ functional/snapshot/snapshot_016_pos.ksh \ functional/snapshot/snapshot_017_pos.ksh \ functional/snapshot/snapshot_018_pos.ksh \ functional/snapused/cleanup.ksh \ functional/snapused/setup.ksh \ functional/snapused/snapused_001_pos.ksh \ functional/snapused/snapused_002_pos.ksh \ functional/snapused/snapused_003_pos.ksh \ functional/snapused/snapused_004_pos.ksh \ functional/snapused/snapused_005_pos.ksh \ functional/sparse/cleanup.ksh \ functional/sparse/setup.ksh \ functional/sparse/sparse_001_pos.ksh \ functional/stat/cleanup.ksh \ functional/stat/setup.ksh \ functional/stat/stat_001_pos.ksh \ functional/suid/cleanup.ksh \ functional/suid/setup.ksh \ functional/suid/suid_write_to_none.ksh \ functional/suid/suid_write_to_sgid.ksh \ functional/suid/suid_write_to_suid.ksh \ functional/suid/suid_write_to_suid_sgid.ksh \ functional/suid/suid_write_zil_replay.ksh \ functional/trim/autotrim_config.ksh \ functional/trim/autotrim_integrity.ksh \ functional/trim/autotrim_trim_integrity.ksh \ functional/trim/cleanup.ksh \ functional/trim/setup.ksh \ functional/trim/trim_config.ksh \ functional/trim/trim_integrity.ksh \ functional/trim/trim_l2arc.ksh \ functional/truncate/cleanup.ksh \ functional/truncate/setup.ksh \ functional/truncate/truncate_001_pos.ksh \ functional/truncate/truncate_002_pos.ksh \ functional/truncate/truncate_timestamps.ksh \ functional/upgrade/cleanup.ksh \ functional/upgrade/setup.ksh \ functional/upgrade/upgrade_projectquota_001_pos.ksh \ functional/upgrade/upgrade_readonly_pool.ksh \ functional/upgrade/upgrade_userobj_001_pos.ksh \ functional/user_namespace/cleanup.ksh \ functional/user_namespace/setup.ksh \ functional/user_namespace/user_namespace_001.ksh \ functional/user_namespace/user_namespace_002.ksh \ functional/user_namespace/user_namespace_003.ksh \ functional/user_namespace/user_namespace_004.ksh \ functional/userquota/cleanup.ksh \ functional/userquota/groupspace_001_pos.ksh \ functional/userquota/groupspace_002_pos.ksh \ functional/userquota/groupspace_003_pos.ksh \ functional/userquota/setup.ksh \ functional/userquota/userquota_001_pos.ksh \ functional/userquota/userquota_002_pos.ksh \ functional/userquota/userquota_003_pos.ksh \ functional/userquota/userquota_004_pos.ksh \ functional/userquota/userquota_005_neg.ksh \ functional/userquota/userquota_006_pos.ksh \ functional/userquota/userquota_007_pos.ksh \ functional/userquota/userquota_008_pos.ksh \ functional/userquota/userquota_009_pos.ksh \ functional/userquota/userquota_010_pos.ksh \ functional/userquota/userquota_011_pos.ksh \ functional/userquota/userquota_012_neg.ksh \ functional/userquota/userquota_013_pos.ksh \ functional/userquota/userspace_001_pos.ksh \ functional/userquota/userspace_002_pos.ksh \ functional/userquota/userspace_003_pos.ksh \ functional/userquota/userspace_encrypted.ksh \ functional/userquota/userspace_send_encrypted.ksh \ functional/userquota/userspace_encrypted_13709.ksh \ functional/vdev_zaps/cleanup.ksh \ functional/vdev_zaps/setup.ksh \ functional/vdev_zaps/vdev_zaps_001_pos.ksh \ functional/vdev_zaps/vdev_zaps_002_pos.ksh \ functional/vdev_zaps/vdev_zaps_003_pos.ksh \ functional/vdev_zaps/vdev_zaps_004_pos.ksh \ functional/vdev_zaps/vdev_zaps_005_pos.ksh \ functional/vdev_zaps/vdev_zaps_006_pos.ksh \ functional/vdev_zaps/vdev_zaps_007_pos.ksh \ functional/write_dirs/cleanup.ksh \ functional/write_dirs/setup.ksh \ functional/write_dirs/write_dirs_001_pos.ksh \ functional/write_dirs/write_dirs_002_pos.ksh \ functional/xattr/cleanup.ksh \ functional/xattr/setup.ksh \ functional/xattr/xattr_001_pos.ksh \ functional/xattr/xattr_002_neg.ksh \ functional/xattr/xattr_003_neg.ksh \ functional/xattr/xattr_004_pos.ksh \ functional/xattr/xattr_005_pos.ksh \ functional/xattr/xattr_006_pos.ksh \ functional/xattr/xattr_007_neg.ksh \ functional/xattr/xattr_008_pos.ksh \ functional/xattr/xattr_009_neg.ksh \ functional/xattr/xattr_010_neg.ksh \ functional/xattr/xattr_011_pos.ksh \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_compat.ksh \ functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \ functional/zvol/zvol_cli/cleanup.ksh \ functional/zvol/zvol_cli/setup.ksh \ functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \ functional/zvol/zvol_ENOSPC/cleanup.ksh \ functional/zvol/zvol_ENOSPC/setup.ksh \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \ functional/zvol/zvol_misc/cleanup.ksh \ functional/zvol/zvol_misc/setup.ksh \ functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_fua.ksh \ functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \ functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \ functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \ functional/zvol/zvol_misc/zvol_misc_trim.ksh \ functional/zvol/zvol_misc/zvol_misc_volmode.ksh \ functional/zvol/zvol_misc/zvol_misc_zil.ksh \ functional/zvol/zvol_stress/cleanup.ksh \ functional/zvol/zvol_stress/setup.ksh \ functional/zvol/zvol_stress/zvol_stress.ksh \ functional/zvol/zvol_swap/cleanup.ksh \ functional/zvol/zvol_swap/setup.ksh \ functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \ functional/idmap_mount/cleanup.ksh \ functional/idmap_mount/setup.ksh \ functional/idmap_mount/idmap_mount_001.ksh \ functional/idmap_mount/idmap_mount_002.ksh \ functional/idmap_mount/idmap_mount_003.ksh \ functional/idmap_mount/idmap_mount_004.ksh \ functional/idmap_mount/idmap_mount_005.ksh diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 2cd2f4763a73..e52b34ec8a51 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -1,66 +1,68 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2023, Klara Inc. # Copyright (c) 2023, Rob Norris # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib verify_runnable "global" if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi claim="copy_file_range will fall back to copy when cloning on same txg" log_assert $claim typeset timeout=$(get_tunable TXG_TIMEOUT) function cleanup { datasetexists $TESTPOOL && destroy_pool $TESTPOOL set_tunable64 TXG_TIMEOUT $timeout } log_onexit cleanup log_must set_tunable64 TXG_TIMEOUT 5000 log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS +log_must sync_pool $TESTPOOL true + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) log_must [ "$blocks" = "" ] log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 4a9fb5e7489a..cf9c6a8499af 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -1,65 +1,70 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib export DISK=${DISKS%% *} export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 32))m" export FILE_SIZE="$((MINVDEVSIZE))" export SLICE_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 2))m" export MAX_NUM=6 export DEVICE_DIR=$TEST_BASE_DIR/dev_import-test export BACKUP_DEVICE_DIR=$TEST_BASE_DIR/bakdev_import-test export DEVICE_FILE=disk export DEVICE_ARCHIVE=archive_import-test export MYTESTFILE=$STF_SUITE/include/libtest.shlib export CPATH=$TEST_BASE_DIR/cachefile.$$ export CPATHBKP=$TEST_BASE_DIR/cachefile.$$.bkp export CPATHBKP2=$TEST_BASE_DIR/cachefile.$$.bkp2 export MD5FILE=$TEST_BASE_DIR/md5sums.$$ export MD5FILE2=$TEST_BASE_DIR/md5sums.$$.2 export GROUP_NUM=3 typeset -i num=0 while (( num < $GROUP_NUM )); do DEVICE_FILES="$DEVICE_FILES ${DEVICE_DIR}/${DEVICE_FILE}$num" (( num = num + 1 )) done export DEVICE_FILES export VDEV0=$DEVICE_DIR/${DEVICE_FILE}0 export VDEV1=$DEVICE_DIR/${DEVICE_FILE}1 export VDEV2=$DEVICE_DIR/${DEVICE_FILE}2 export VDEV3=$DEVICE_DIR/${DEVICE_FILE}3 export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test + +export HOSTID_FILE="/etc/hostid" +export HOSTID1=01234567 +export HOSTID2=89abcdef diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index 559810ff0e30..50157fa80578 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -1,347 +1,348 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg # # Prototype cleanup function for zpool_import tests. # function cleanup { # clear any remaining zinjections log_must eval "zinject -c all > /dev/null" destroy_pool $TESTPOOL1 log_must rm -f $CPATH $CPATHBKP $CPATHBKP2 $MD5FILE $MD5FILE2 log_must rm -rf $DEVICE_DIR/* typeset i=0 while (( i < $MAX_NUM )); do log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done is_linux && set_tunable32 TXG_HISTORY 0 } # # Write a bit of data and sync several times. # This function is intended to be used by zpool rewind tests. # function sync_some_data_a_few_times { typeset pool=$1 typeset -i a_few_times=${2:-10} typeset file="/$pool/tmpfile" for i in {0..$a_few_times}; do dd if=/dev/urandom of=${file}_$i bs=128k count=10 sync_pool "$pool" done return 0 } # # Just write a moderate amount of data to the pool. # function write_some_data { typeset pool=$1 typeset files10mb=${2:-10} typeset ds="$pool/fillerds" zfs create $ds || return 1 # Create 100 MB of data typeset file="/$ds/fillerfile" for i in {1..$files10mb}; do dd if=/dev/urandom of=$file.$i bs=128k count=80 || return 1 done } # # Create/overwrite a few datasets with files. # Checksum all the files and store digests in a file. # # newdata: overwrite existing files if false. # md5file: file where to store md5 digests # datasetname: base name for datasets # function _generate_data_common { typeset pool=$1 typeset newdata=$2 typeset md5file=$3 typeset datasetname=$4 typeset -i datasets=3 typeset -i files=5 typeset -i blocks=10 [[ -n $md5file ]] && rm -f $md5file for i in {1..$datasets}; do ( $newdata ) && log_must zfs create "$pool/$datasetname$i" for j in {1..$files}; do typeset file="/$pool/$datasetname$i/file$j" dd if=/dev/urandom of=$file bs=128k count=$blocks > /dev/null if [[ -n $md5file ]]; then typeset cksum=$(md5digest $file) echo $cksum $file >> $md5file fi done ( $newdata ) && sync_pool "$pool" done return 0 } function generate_data { typeset pool=$1 typeset md5file="$2" typeset datasetname=${3:-ds} _generate_data_common $pool true "$md5file" $datasetname } function overwrite_data { typeset pool=$1 typeset md5file="$2" typeset datasetname=${3:-ds} _generate_data_common $1 false "$md5file" $datasetname } # # Verify md5sums of every file in md5sum file $1. # function verify_data_md5sums { typeset md5file=$1 if [[ ! -f $md5file ]]; then log_note "md5 sums file '$md5file' doesn't exist" return 1 fi while read -r digest file; do typeset digest1=$(md5digest $file) if [[ "$digest1" != "$digest" ]]; then return 1 fi done < $md5file return 0 } # # Set devices size in DEVICE_DIR to $1. # function increase_device_sizes { typeset newfilesize=$1 typeset -i i=0 while (( i < $MAX_NUM )); do log_must truncate -s $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done } # # Translate vdev names returned by zpool status into more generic names. # function _translate_vdev { typeset vdev=$1 # # eg: mirror-2 --> mirror # eg: draid2:4d:12c:1s-0 --> draid2 # typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect draid1 draid2 draid3" for word in $keywords; do if echo $vdev | grep -qE "^${word}-[0-9]+\$|^${word}:[0-9]+d:[0-9]c:[0-9]+s-[0-9]+\$" then vdev=$word break fi done case "$vdev" in logs) echo "log" ;; raidz1) echo "raidz" ;; draid1) echo "draid" ;; *) echo $vdev ;; esac } # # Check that pool configuration returned by zpool status matches expected # configuration. Format for the check string is same as the vdev arguments for # creating a pool # Add -q for quiet mode. # # eg: check_pool_config pool1 "mirror c0t0d0s0 c0t1d0s0 log c1t1d0s0" # function check_pool_config { typeset logfailure=true if [[ $1 == '-q' ]]; then logfailure=false shift fi typeset poolname=$1 typeset expected=$2 typeset status if ! status=$(zpool status $poolname 2>&1); then if $logfailure; then log_note "zpool status $poolname failed: $status" fi return 1 fi typeset actual="" typeset began=false while read -r vdev _; do if ( ! $began ) && [[ $vdev == NAME ]]; then began=true continue fi ( $began ) && [[ -z $vdev ]] && break; if ( $began ); then [[ -z $actual ]] && actual="$vdev" && continue vdev=$(_translate_vdev $vdev) actual="$actual $vdev" fi done <<<"$status" expected="$poolname $expected" if [[ "$actual" != "$expected" ]]; then if $logfailure; then log_note "expected pool vdevs:" log_note "> '$expected'" log_note "actual pool vdevs:" log_note "> '$actual'" fi return 1 fi return 0 } # # Check that pool configuration returned by zpool status matches expected # configuration within a given timeout in seconds. See check_pool_config(). # # eg: wait_for_pool_config pool1 "mirror c0t0d0s0 c0t1d0s0" 60 # function wait_for_pool_config { typeset poolname=$1 typeset expectedconfig="$2" typeset -i timeout=${3:-60} timeout=$(( $timeout + $(date +%s) )) while (( $(date +%s) < $timeout )); do check_pool_config -q $poolname "$expectedconfig" && return 0 sleep 3 done check_pool_config $poolname "$expectedconfig" } # # Check that pool status is ONLINE # function check_pool_healthy { typeset pool=$1 typeset status if ! status=$(zpool status $pool 2>&1); then log_note "zpool status $pool failed: $status" return 1 fi status=$(echo "$status" | awk -v p="$pool" '!/pool:/ && $0 ~ p {print $2}') if [[ $status != "ONLINE" ]]; then log_note "Invalid zpool status for '$pool': '$status'" \ "!= 'ONLINE'" return 1 fi return 0 } # # Return 0 if a device is currently being replaced in the pool. # function pool_is_replacing { typeset pool=$1 zpool status $pool | grep "replacing" | grep -q "ONLINE" } function set_vdev_validate_skip { set_tunable32 VDEV_VALIDATE_SKIP "$1" } function get_zfs_txg_timeout { get_tunable TXG_TIMEOUT } function set_zfs_txg_timeout { set_tunable32 TXG_TIMEOUT "$1" } function set_spa_load_verify_metadata { set_tunable32 SPA_LOAD_VERIFY_METADATA "$1" } function set_spa_load_verify_data { set_tunable32 SPA_LOAD_VERIFY_DATA "$1" } function set_zfs_max_missing_tvds { set_tunable32 MAX_MISSING_TVDS "$1" } # # Use zdb to find the last txg that was synced in an active pool. # function get_last_txg_synced { typeset pool=$1 zdb -u $pool | awk '$1 == "txg" { print $3 }' | sort -n | tail -n 1 } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh new file mode 100755 index 000000000000..bc82b7cc1ee8 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable without force even if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Export the pool. +# 4. Change the hostid. +# 5. Verify that importing the pool without force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Export the pool. +log_must zpool export $TESTPOOL1 + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool without force succeeds. +log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool when hostid changes." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh new file mode 100755 index 000000000000..07c43482d68f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable from a cachefile +# without force even if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool with a cachefile. +# 3. Backup the cachfile. +# 4. Export the pool. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile succeeds +# without force. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachfile. +log_must cp $CPATH $CPATHBKP + +# 4. Export the pool. +log_must zpool export $TESTPOOL1 + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile succeeds without force. +log_must zpool import -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool from cachefile " \ + "when hostid changes." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh new file mode 100755 index 000000000000..dcb1ac1ab69f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable from a cachefile +# without force if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Backup the cachefile. +# 4. Simulate the pool being torn down without export: +# 4.1. Copy the underlying device state. +# 4.2. Export the pool. +# 4.3. Restore the device state from the copy. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile fails. +# 7. Verify that importing the pool from the cachefile with force +# succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachfile. +log_must cp $CPATH $CPATHBKP + +# 4. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile fails. +log_mustnot zpool import -c $CPATHBKP $TESTPOOL1 + +# 7. Verify that importing the pool from the cachefile with force succeeds. +log_must zpool import -f -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import from cachefile requires force if not cleanly " \ + "exported and hostid changes." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh new file mode 100755 index 000000000000..ad8cca642dbc --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable without force if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Simulate the pool being torn down without export: +# 3.1. Copy the underlying device state. +# 3.2. Export the pool. +# 3.3. Restore the device state from the copy. +# 4. Change the hostid. +# 5. Verify that importing the pool fails. +# 6. Verify that importing the pool with force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool fails. +log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1 + +# 6. Verify that importing the pool with force succeeds. +log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1 + +log_pass "zpool import requires force if not cleanly exported " \ + "and hostid changed."